diff options
author | Rodney W. Grimes <rgrimes@FreeBSD.org> | 1994-05-27 12:33:43 +0000 |
---|---|---|
committer | Rodney W. Grimes <rgrimes@FreeBSD.org> | 1994-05-27 12:33:43 +0000 |
commit | 9b50d9027575220cb6dd09b3e62f03f511e908b8 (patch) | |
tree | defc987843071f19a5891a97145437d43cba2af8 /usr.bin/compress | |
parent | efd31c5952bb9bb2485c109382100127424f8611 (diff) | |
download | src-9b50d9027575220cb6dd09b3e62f03f511e908b8.tar.gz src-9b50d9027575220cb6dd09b3e62f03f511e908b8.zip |
Notes
Diffstat (limited to 'usr.bin/compress')
-rw-r--r-- | usr.bin/compress/Makefile | 12 | ||||
-rw-r--r-- | usr.bin/compress/compress.1 | 172 | ||||
-rw-r--r-- | usr.bin/compress/compress.c | 444 | ||||
-rw-r--r-- | usr.bin/compress/doc/NOTES | 139 | ||||
-rw-r--r-- | usr.bin/compress/doc/README | 283 | ||||
-rw-r--r-- | usr.bin/compress/doc/revision.log | 116 | ||||
-rw-r--r-- | usr.bin/compress/zcat.sh | 37 | ||||
-rw-r--r-- | usr.bin/compress/zopen.3 | 139 | ||||
-rw-r--r-- | usr.bin/compress/zopen.c | 740 |
9 files changed, 2082 insertions, 0 deletions
diff --git a/usr.bin/compress/Makefile b/usr.bin/compress/Makefile new file mode 100644 index 000000000000..6a0683018cb9 --- /dev/null +++ b/usr.bin/compress/Makefile @@ -0,0 +1,12 @@ +# @(#)Makefile 8.2 (Berkeley) 4/17/94 + +PROG= compress +SRCS= compress.c zopen.c +LINKS= ${BINDIR}/compress ${BINDIR}/uncompress +MLINKS= compress.1 uncompress.1 compress.1 zcat.1 + +afterinstall: + install -c -o ${BINOWN} -g ${BINGRP} -m ${BINMODE} \ + ${.CURDIR}/zcat.sh ${DESTDIR}/usr/bin/zcat + +.include <bsd.prog.mk> diff --git a/usr.bin/compress/compress.1 b/usr.bin/compress/compress.1 new file mode 100644 index 000000000000..74b7348cf928 --- /dev/null +++ b/usr.bin/compress/compress.1 @@ -0,0 +1,172 @@ +.\" Copyright (c) 1986, 1990, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" This code is derived from software contributed to Berkeley by +.\" James A. Woods, derived from original work by Spencer Thomas +.\" and Joseph Orost. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)compress.1 8.2 (Berkeley) 4/18/94 +.\" +.Dd April 18, 1994 +.Dt COMPRESS 1 +.Os BSD 4.3 +.Sh NAME +.Nm compress , +.Nm uncompress , +.Nm zcat +.Nd compress and expand data +.Sh SYNOPSIS +.Nm compress +.Op Fl cfv +.Op Fl b Ar bits +.Op Ar +.Nm uncompress +.Op Fl cfv +.Op Ar +.Nm zcat +.Op Ar +.Sh DESCRIPTION +.Nm Compress +reduces the size of the named files using adaptive Lempel-Ziv coding. +Each +.Ar file +is renamed to the same name plus the extension +.Dq .Z . +As many of the modification time, access time, file flags, file mode, +user ID, and group ID as allowed by permissions are retained in the +new file. +If compression would not reduce the size of a +.Ar file , +the file is ignored. +.Pp +.Nm Uncompress +restores the compressed files to their original form, renaming the +files by deleting the +.Dq .Z +extension. +.Pp +.Nm Zcat +is an alias for +.Dq "uncompress -c" . +.Pp +If renaming the files would cause files to be overwritten and the standard +input device is a terminal, the user is prompted (on the standard error +output) for confirmation. +If prompting is not possible or confirmation is not received, the files +are not overwritten. +.Pp +If no files are specified, the standard input is compressed or uncompressed +to the standard output. +If either the input and output files are not regular files, the checks for +reduction in size and file overwriting are not performed, the input file is +not removed, and the attributes of the input file are not retained. +.Pp +The options are as follows: +.Bl -tag -width Ds +.It Fl b +Specify the +.Ar bits +code limit (see below). +.It Fl c +Compressed or uncompressed output is written to the standard output. +No files are modified. +.It Fl f +Force compression of +.Ar file , +even if it is not actually reduced in size. +Additionally, files are overwritten without prompting for confirmation. +.It Fl v +Print the percentage reduction of each file. +.El +.Pp +.Nm Compress +uses a modified Lempel-Ziv algorithm. +Common substrings in the file are first replaced by 9-bit codes 257 and up. +When code 512 is reached, the algorithm switches to 10-bit codes and +continues to use more bits until the +limit specified by the +.Fl b +flag is reached (the default is 16). +.Ar Bits +must be between 9 and 16. +.Pp +After the +.Ar bits +limit is reached, +.Nm compress +periodically checks the compression ratio. +If it is increasing, +.Nm compress +continues to use the existing code dictionary. +However, if the compression ratio decreases, +.Nm compress +discards the table of substrings and rebuilds it from scratch. This allows +the algorithm to adapt to the next "block" of the file. +.Pp +The +.Fl b +flag is omitted for +.Ar uncompress +since the +.Ar bits +parameter specified during compression +is encoded within the output, along with +a magic number to ensure that neither decompression of random data nor +recompression of compressed data is attempted. +.Pp +.ne 8 +The amount of compression obtained depends on the size of the +input, the number of +.Ar bits +per code, and the distribution of common substrings. +Typically, text such as source code or English is reduced by 50\-60%. +Compression is generally much better than that achieved by Huffman +coding (as used in the historical command pack), or adaptive Huffman +coding (as used in the historical command compact), and takes less +time to compute. +.Pp +The +.Nm compress +utility exits 0 on success, and >0 if an error occurs. +.Sh SEE ALSO +.Rs +.%A Welch, Terry A. +.%D June, 1984 +.%T "A Technique for High Performance Data Compression" +.%J "IEEE Computer" +.%V 17:6 +.%P pp. 8-19 +.Re +.Sh HISTORY +The +.Nm +command appeared in +.Bx 4.3 . diff --git a/usr.bin/compress/compress.c b/usr.bin/compress/compress.c new file mode 100644 index 000000000000..d66d22433dcb --- /dev/null +++ b/usr.bin/compress/compress.c @@ -0,0 +1,444 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef lint +static char copyright[] = +"@(#) Copyright (c) 1992, 1993\n\ + The Regents of the University of California. All rights reserved.\n"; +#endif /* not lint */ + +#ifndef lint +static char sccsid[] = "@(#)compress.c 8.2 (Berkeley) 1/7/94"; +#endif /* not lint */ + +#include <sys/param.h> +#include <sys/time.h> +#include <sys/stat.h> + +#include <err.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#ifdef __STDC__ +#include <stdarg.h> +#else +#include <varargs.h> +#endif + +void compress __P((char *, char *, int)); +void cwarn __P((const char *, ...)); +void cwarnx __P((const char *, ...)); +void decompress __P((char *, char *, int)); +int permission __P((char *)); +void setfile __P((char *, struct stat *)); +void usage __P((int)); + +int eval, force, verbose; + +int +main(argc, argv) + int argc; + char *argv[]; +{ + enum {COMPRESS, DECOMPRESS} style; + size_t len; + int bits, cat, ch; + char *p, newname[MAXPATHLEN]; + + if ((p = rindex(argv[0], '/')) == NULL) + p = argv[0]; + else + ++p; + if (!strcmp(p, "uncompress")) + style = DECOMPRESS; + else if (!strcmp(p, "compress")) + style = COMPRESS; + else + errx(1, "unknown program name"); + + bits = cat = 0; + while ((ch = getopt(argc, argv, "b:cdfv")) != EOF) + switch(ch) { + case 'b': + bits = strtol(optarg, &p, 10); + if (*p) + errx(1, "illegal bit count -- %s", optarg); + break; + case 'c': + cat = 1; + break; + case 'd': /* Backward compatible. */ + style = DECOMPRESS; + break; + case 'f': + force = 1; + break; + case 'v': + verbose = 1; + break; + case '?': + default: + usage(style == COMPRESS); + } + argc -= optind; + argv += optind; + + if (argc == 0) { + switch(style) { + case COMPRESS: + (void)compress("/dev/stdin", "/dev/stdout", bits); + break; + case DECOMPRESS: + (void)decompress("/dev/stdin", "/dev/stdout", bits); + break; + } + exit (eval); + } + + if (cat == 1 && argc > 1) + errx(1, "the -c option permits only a single file argument"); + + for (; *argv; ++argv) + switch(style) { + case COMPRESS: + if (cat) { + compress(*argv, "/dev/stdout", bits); + break; + } + if ((p = rindex(*argv, '.')) != NULL && + !strcmp(p, ".Z")) { + cwarnx("%s: name already has trailing .Z", + *argv); + break; + } + len = strlen(*argv); + if (len > sizeof(newname) - 3) { + cwarnx("%s: name too long", *argv); + break; + } + memmove(newname, *argv, len); + newname[len] = '.'; + newname[len + 1] = 'Z'; + newname[len + 2] = '\0'; + compress(*argv, newname, bits); + break; + case DECOMPRESS: + len = strlen(*argv); + if ((p = rindex(*argv, '.')) == NULL || + strcmp(p, ".Z")) { + if (len > sizeof(newname) - 3) { + cwarnx("%s: name too long", *argv); + break; + } + memmove(newname, *argv, len); + newname[len] = '.'; + newname[len + 1] = 'Z'; + newname[len + 2] = '\0'; + decompress(newname, + cat ? "/dev/stdout" : *argv, bits); + } else { + if (len - 2 > sizeof(newname) - 1) { + cwarnx("%s: name too long", *argv); + break; + } + memmove(newname, *argv, len - 2); + newname[len - 2] = '\0'; + decompress(*argv, + cat ? "/dev/stdout" : newname, bits); + } + break; + } + exit (eval); +} + +void +compress(in, out, bits) + char *in, *out; + int bits; +{ + register int nr; + struct stat isb, sb; + FILE *ifp, *ofp; + int exists, isreg, oreg; + u_char buf[1024]; + + exists = !stat(out, &sb); + if (!force && exists && S_ISREG(sb.st_mode) && !permission(out)) + return; + isreg = oreg = !exists || S_ISREG(sb.st_mode); + + ifp = ofp = NULL; + if ((ifp = fopen(in, "r")) == NULL) { + cwarn("%s", in); + return; + } + if (stat(in, &isb)) { /* DON'T FSTAT! */ + cwarn("%s", in); + goto err; + } + if (!S_ISREG(isb.st_mode)) + isreg = 0; + + if ((ofp = zopen(out, "w", bits)) == NULL) { + cwarn("%s", out); + goto err; + } + while ((nr = fread(buf, 1, sizeof(buf), ifp)) != 0) + if (fwrite(buf, 1, nr, ofp) != nr) { + cwarn("%s", out); + goto err; + } + + if (ferror(ifp) || fclose(ifp)) { + cwarn("%s", in); + goto err; + } + ifp = NULL; + + if (fclose(ofp)) { + cwarn("%s", out); + goto err; + } + ofp = NULL; + + if (isreg) { + if (stat(out, &sb)) { + cwarn("%s", out); + goto err; + } + + if (!force && sb.st_size >= isb.st_size) { + if (verbose) + (void)printf("%s: file would grow; left unmodified\n", in); + if (unlink(out)) + cwarn("%s", out); + goto err; + } + + setfile(out, &isb); + + if (unlink(in)) + cwarn("%s", in); + + if (verbose) { + (void)printf("%s: ", out); + if (isb.st_size > sb.st_size) + (void)printf("%.0f%% compression\n", + ((float)sb.st_size / isb.st_size) * 100.0); + else + (void)printf("%.0f%% expansion\n", + ((float)isb.st_size / sb.st_size) * 100.0); + } + } + return; + +err: if (ofp) { + if (oreg) + (void)unlink(out); + (void)fclose(ofp); + } + if (ifp) + (void)fclose(ifp); +} + +void +decompress(in, out, bits) + char *in, *out; + int bits; +{ + register int nr; + struct stat sb; + FILE *ifp, *ofp; + int exists, isreg, oreg; + u_char buf[1024]; + + exists = !stat(out, &sb); + if (!force && exists && S_ISREG(sb.st_mode) && !permission(out)) + return; + isreg = oreg = !exists || S_ISREG(sb.st_mode); + + ifp = ofp = NULL; + if ((ofp = fopen(out, "w")) == NULL) { + cwarn("%s", out); + return; + } + + if ((ifp = zopen(in, "r", bits)) == NULL) { + cwarn("%s", in); + goto err; + } + if (stat(in, &sb)) { + cwarn("%s", in); + goto err; + } + if (!S_ISREG(sb.st_mode)) + isreg = 0; + + while ((nr = fread(buf, 1, sizeof(buf), ifp)) != 0) + if (fwrite(buf, 1, nr, ofp) != nr) { + cwarn("%s", out); + goto err; + } + + if (ferror(ifp) || fclose(ifp)) { + cwarn("%s", in); + goto err; + } + ifp = NULL; + + if (fclose(ofp)) { + cwarn("%s", out); + goto err; + } + + if (isreg) { + setfile(out, &sb); + + if (unlink(in)) + cwarn("%s", in); + } + return; + +err: if (ofp) { + if (oreg) + (void)unlink(out); + (void)fclose(ofp); + } + if (ifp) + (void)fclose(ifp); +} + +void +setfile(name, fs) + char *name; + register struct stat *fs; +{ + static struct timeval tv[2]; + + fs->st_mode &= S_ISUID|S_ISGID|S_IRWXU|S_IRWXG|S_IRWXO; + + TIMESPEC_TO_TIMEVAL(&tv[0], &fs->st_atimespec); + TIMESPEC_TO_TIMEVAL(&tv[1], &fs->st_mtimespec); + if (utimes(name, tv)) + cwarn("utimes: %s", name); + + /* + * Changing the ownership probably won't succeed, unless we're root + * or POSIX_CHOWN_RESTRICTED is not set. Set uid/gid before setting + * the mode; current BSD behavior is to remove all setuid bits on + * chown. If chown fails, lose setuid/setgid bits. + */ + if (chown(name, fs->st_uid, fs->st_gid)) { + if (errno != EPERM) + cwarn("chown: %s", name); + fs->st_mode &= ~(S_ISUID|S_ISGID); + } + if (chmod(name, fs->st_mode)) + cwarn("chown: %s", name); + + if (chflags(name, fs->st_flags)) + cwarn("chflags: %s", name); +} + +int +permission(fname) + char *fname; +{ + int ch, first; + + if (!isatty(fileno(stderr))) + return (0); + (void)fprintf(stderr, "overwrite %s? ", fname); + first = ch = getchar(); + while (ch != '\n' && ch != EOF) + ch = getchar(); + return (first == 'y'); +} + +void +usage(iscompress) + int iscompress; +{ + if (iscompress) + (void)fprintf(stderr, + "usage: compress [-cfv] [-b bits] [file ...]\n"); + else + (void)fprintf(stderr, + "usage: uncompress [-c] [-b bits] [file ...]\n"); + exit(1); +} + +void +#if __STDC__ +cwarnx(const char *fmt, ...) +#else +cwarnx(fmt, va_alist) + int eval; + const char *fmt; + va_dcl +#endif +{ + va_list ap; +#if __STDC__ + va_start(ap, fmt); +#else + va_start(ap); +#endif + vwarnx(fmt, ap); + va_end(ap); + eval = 1; +} + +void +#if __STDC__ +cwarn(const char *fmt, ...) +#else +cwarn(fmt, va_alist) + int eval; + const char *fmt; + va_dcl +#endif +{ + va_list ap; +#if __STDC__ + va_start(ap, fmt); +#else + va_start(ap); +#endif + vwarn(fmt, ap); + va_end(ap); + eval = 1; +} diff --git a/usr.bin/compress/doc/NOTES b/usr.bin/compress/doc/NOTES new file mode 100644 index 000000000000..7c28c9c22b4f --- /dev/null +++ b/usr.bin/compress/doc/NOTES @@ -0,0 +1,139 @@ +From: James A. Woods <jaw@eos.arc.nasa.gov> + +>From vn Fri Dec 2 18:05:27 1988 +Subject: Re: Looking for C source for RSA +Newsgroups: sci.crypt + +# Illegitimi noncarborundum + +Patents are a tar pit. + +A good case can be made that most are just a license to sue, and nothing +is illegal until a patent is upheld in court. + +For example, if you receive netnews by means other than 'nntp', +these very words are being modulated by 'compress', +a variation on the patented Lempel-Ziv-Welch algorithm. + +Original Ziv-Lempel is patent number 4,464,650, and the more powerful +LZW method is #4,558,302. Yet despite any similarities between 'compress' +and LZW (the public-domain 'compress' code was designed and given to the +world before the ink on the Welch patent was dry), no attorneys from Sperry +(the assignee) have asked you to unplug your Usenet connection. + +Why? I can't speak for them, but it is possible the claims are too broad, +or, just as bad, not broad enough. ('compress' does things not mentioned +in the Welch patent.) Maybe they realize that they can commercialize +LZW better by selling hardware implementations rather than by licensing +software. Again, the LZW software delineated in the patent is *not* +the same as that of 'compress'. + +At any rate, court-tested software patents are a different animal; +corporate patents in a portfolio are usually traded like baseball cards +to shut out small fry rather than actually be defended before +non-technical juries. Perhaps RSA will undergo this test successfully, +although the grant to "exclude others from making, using, or selling" +the invention would then only apply to the U.S. (witness the +Genentech patent of the TPA molecule in the U.S. but struck down +in Great Britain as too broad.) + +The concept is still exotic for those who learned in school the rule of thumb +that one may patent "apparatus" but not an "idea". +Apparently this all changed in Diamond v. Diehr (1981) when the U. S. Supreme +Court reversed itself. + +Scholars should consult the excellent article in the Washington and Lee +Law Review (fall 1984, vol. 41, no. 4) by Anthony and Colwell for a +comprehensive survey of an area which will remain murky for some time. + +Until the dust clears, how you approach ideas which are patented depends +on how paranoid you are of a legal onslaught. Arbitrary? Yes. But +the patent bar the the CCPA (Court of Customs and Patent Appeals) +thanks you for any uncertainty as they, at least, stand to gain +from any trouble. + +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +From: James A. Woods <jaw@eos.arc.nasa.gov> +Subject: Re: Looking for C source for RSA (actually 'compress' patents) + + In article <2042@eos.UUCP> you write: + >The concept is still exotic for those who learned in school the rule of thumb + >that one may patent "apparatus" but not an "idea". + +A rule of thumb that has never been completely valid, as any chemical +engineer can tell you. (Chemical processes were among the earliest patents, +as I recall.) + + ah yes -- i date myself when relaying out-of-date advice from elderly + attorneys who don't even specialize in patents. one other interesting + class of patents include the output of optical lens design programs, + which yield formulae which can then fairly directly can be molded + into glass. although there are restrictions on patenting equations, + the "embedded systems" seem to fly past the legal gauntlets. + + anyway, i'm still learning about intellectual property law after + several conversations from a unisys (nee sperry) lawyer re 'compress'. + + it's more complicated than this, but they're letting (oral + communication only) software versions of 'compress' slide + as far as licensing fees go. this includes 'arc', 'stuffit', + and other commercial wrappers for 'compress'. yet they are + signing up licensees for hardware chips. hewlett-packard + supposedly has an active vlsi project, and unisys has + board-level lzw-based tape controllers. (to build lzw into + a disk controller would be strange, as you'd have to build + in a filesystem too!) + + it's byzantine + that unisys is in a tiff with hp regarding the patents, + after discovering some sort of "compress" button on some + hp terminal product. why? well, professor abraham lempel jumped + from being department chairman of computer science at technion in + israel to sperry (where he got the first patent), but then to work + at hewlett-packard on sabbatical. the second welch patent + is only weakly derivative of the first, so they want chip + licenses and hp relented. however, everyone agrees something + like the current unix implementation is the way to go with + software, so hp (and ucb) long ago asked spencer thomas and i to sign + off on copyright permission (although they didn't need to, it being pd). + lempel, hp, and unisys grumbles they can't make money off the + software since a good free implementation (not the best -- + i have more ideas!) escaped via usenet. (lempel's own pascal + code was apparently horribly slow.) + i don't follow the ibm 'arc' legal bickering; my impression + is that the pc folks are making money off the archiver/wrapper + look/feel of the thing [if ms-dos can be said to have a look and feel]. + + now where is telebit with the compress firmware? in a limbo + netherworld, probably, with sperry still welcoming outfits + to sign patent licenses, a common tactic to bring other small fry + into the fold. the guy who crammed 12-bit compess into the modem + there left. also what is transpiring with 'compress' and sys 5 rel 4? + beats me, but if sperry got a hold of them on these issues, + at&t would likely re-implement another algorithm if they + thought 'compress' infringes. needful to say, i don't think + it does after the abovementioned legal conversation. + my own beliefs on whether algorithms should be patentable at all + change with the weather. if the courts finally nail down + patent protection for algorithms, academic publication in + textbooks will be somewhat at odds with the engineering world, + where the textbook codes will simply be a big tease to get + money into the patent holder coffers... + + oh, if you implement lzw from the patent, you won't get + good rates because it doesn't mention adaptive table reset, + lack thereof being *the* serious deficiency of thomas' first version. + + now i know that patent law generally protects against independent + re-invention (like the 'xor' hash function pleasantly mentioned + in the patent [but not the paper]). + but the upshot is that if anyone ever wanted to sue us, + we're partially covered with + independently-developed twists, plus the fact that some of us work + in a bureacratic morass (as contractor to a public agency in my case). + + quite a mess, huh? i've wanted to tell someone this stuff + for a long time, for posterity if nothing else. + +james + diff --git a/usr.bin/compress/doc/README b/usr.bin/compress/doc/README new file mode 100644 index 000000000000..6803287c207a --- /dev/null +++ b/usr.bin/compress/doc/README @@ -0,0 +1,283 @@ + + @(#)README 8.1 (Berkeley) 6/9/93 + +Compress version 4.0 improvements over 3.0: + o compress() speedup (10-50%) by changing division hash to xor + o decompress() speedup (5-10%) + o Memory requirements reduced (3-30%) + o Stack requirements reduced to less than 4kb + o Removed 'Big+Fast' compress code (FBITS) because of compress speedup + o Portability mods for Z8000 and PC/XT (but not zeus 3.2) + o Default to 'quiet' mode + o Unification of 'force' flags + o Manual page overhaul + o Portability enhancement for M_XENIX + o Removed text on #else and #endif + o Added "-V" switch to print version and options + o Added #defines for SIGNED_COMPARE_SLOW + o Added Makefile and "usermem" program + o Removed all floating point computations + o New programs: [deleted] + +The "usermem" script attempts to determine the maximum process size. Some +editing of the script may be necessary (see the comments). [It should work +fine on 4.3 bsd.] If you can't get it to work at all, just create file +"USERMEM" containing the maximum process size in decimal. + +The following preprocessor symbols control the compilation of "compress.c": + + o USERMEM Maximum process memory on the system + o SACREDMEM Amount to reserve for other proceses + o SIGNED_COMPARE_SLOW Unsigned compare instructions are faster + o NO_UCHAR Don't use "unsigned char" types + o BITS Overrules default set by USERMEM-SACREDMEM + o vax Generate inline assembler + o interdata Defines SIGNED_COMPARE_SLOW + o M_XENIX Makes arrays < 65536 bytes each + o pdp11 BITS=12, NO_UCHAR + o z8000 BITS=12 + o pcxt BITS=12 + o BSD4_2 Allow long filenames ( > 14 characters) & + Call setlinebuf(stderr) + +The difference "usermem-sacredmem" determines the maximum BITS that can be +specified with the "-b" flag. + +memory: at least BITS +------ -- ----- ---- + 433,484 16 + 229,600 15 + 127,536 14 + 73,464 13 + 0 12 + +The default is BITS=16. + +The maximum bits can be overrulled by specifying "-DBITS=bits" at +compilation time. + +WARNING: files compressed on a large machine with more bits than allowed by +a version of compress on a smaller machine cannot be decompressed! Use the +"-b12" flag to generate a file on a large machine that can be uncompressed +on a 16-bit machine. + +The output of compress 4.0 is fully compatible with that of compress 3.0. +In other words, the output of compress 4.0 may be fed into uncompress 3.0 or +the output of compress 3.0 may be fed into uncompress 4.0. + +The output of compress 4.0 not compatible with that of +compress 2.0. However, compress 4.0 still accepts the output of +compress 2.0. To generate output that is compatible with compress +2.0, use the undocumented "-C" flag. + + -from mod.sources, submitted by vax135!petsd!joe (Joe Orost), 8/1/85 +-------------------------------- + +Enclosed is compress version 3.0 with the following changes: + +1. "Block" compression is performed. After the BITS run out, the + compression ratio is checked every so often. If it is decreasing, + the table is cleared and a new set of substrings are generated. + + This makes the output of compress 3.0 not compatible with that of + compress 2.0. However, compress 3.0 still accepts the output of + compress 2.0. To generate output that is compatible with compress + 2.0, use the undocumented "-C" flag. + +2. A quiet "-q" flag has been added for use by the news system. + +3. The character chaining has been deleted and the program now uses + hashing. This improves the speed of the program, especially + during decompression. Other speed improvements have been made, + such as using putc() instead of fwrite(). + +4. A large table is used on large machines when a relatively small + number of bits is specified. This saves much time when compressing + for a 16-bit machine on a 32-bit virtual machine. Note that the + speed improvement only occurs when the input file is > 30000 + characters, and the -b BITS is less than or equal to the cutoff + described below. + +Most of these changes were made by James A. Woods (ames!jaw). Thank you +James! + +To compile compress: + + cc -O -DUSERMEM=usermem -o compress compress.c + +Where "usermem" is the amount of physical user memory available (in bytes). +If any physical memory is to be reserved for other processes, put in +"-DSACREDMEM sacredmem", where "sacredmem" is the amount to be reserved. + +The difference "usermem-sacredmem" determines the maximum BITS that can be +specified, and the cutoff bits where the large+fast table is used. + +memory: at least BITS cutoff +------ -- ----- ---- ------ + 4,718,592 16 13 + 2,621,440 16 12 + 1,572,864 16 11 + 1,048,576 16 10 + 631,808 16 -- + 329,728 15 -- + 178,176 14 -- + 99,328 13 -- + 0 12 -- + +The default memory size is 750,000 which gives a maximum BITS=16 and no +large+fast table. + +The maximum bits can be overruled by specifying "-DBITS=bits" at +compilation time. + +If your machine doesn't support unsigned characters, define "NO_UCHAR" +when compiling. + +If your machine has "int" as 16-bits, define "SHORT_INT" when compiling. + +After compilation, move "compress" to a standard executable location, such +as /usr/local. Then: + cd /usr/local + ln compress uncompress + ln compress zcat + +On machines that have a fixed stack size (such as Perkin-Elmer), set the +stack to at least 12kb. ("setstack compress 12" on Perkin-Elmer). + +Next, install the manual (compress.l). + cp compress.l /usr/man/manl + cd /usr/man/manl + ln compress.l uncompress.l + ln compress.l zcat.l + + - or - + + cp compress.l /usr/man/man1/compress.1 + cd /usr/man/man1 + ln compress.1 uncompress.1 + ln compress.1 zcat.1 + + regards, + petsd!joe + +Here is a note from the net: + +>From hplabs!pesnta!amd!turtlevax!ken Sat Jan 5 03:35:20 1985 +Path: ames!hplabs!pesnta!amd!turtlevax!ken +From: ken@turtlevax.UUCP (Ken Turkowski) +Newsgroups: net.sources +Subject: Re: Compress release 3.0 : sample Makefile +Organization: CADLINC, Inc. @ Menlo Park, CA + +In the compress 3.0 source recently posted to mod.sources, there is a +#define variable which can be set for optimum performance on a machine +with a large amount of memory. A program (usermem) to calculate the +useable amount of physical user memory is enclosed, as well as a sample +4.2bsd Vax Makefile for compress. + +Here is the README file from the previous version of compress (2.0): + +>Enclosed is compress.c version 2.0 with the following bugs fixed: +> +>1. The packed files produced by compress are different on different +> machines and dependent on the vax sysgen option. +> The bug was in the different byte/bit ordering on the +> various machines. This has been fixed. +> +> This version is NOT compatible with the original vax posting +> unless the '-DCOMPATIBLE' option is specified to the C +> compiler. The original posting has a bug which I fixed, +> causing incompatible files. I recommend you NOT to use this +> option unless you already have a lot of packed files from +> the original posting by thomas. +>2. The exit status is not well defined (on some machines) causing the +> scripts to fail. +> The exit status is now 0,1 or 2 and is documented in +> compress.l. +>3. The function getopt() is not available in all C libraries. +> The function getopt() is no longer referenced by the +> program. +>4. Error status is not being checked on the fwrite() and fflush() calls. +> Fixed. +> +>The following enhancements have been made: +> +>1. Added facilities of "compact" into the compress program. "Pack", +> "Unpack", and "Pcat" are no longer required (no longer supplied). +>2. Installed work around for C compiler bug with "-O". +>3. Added a magic number header (\037\235). Put the bits specified +> in the file. +>4. Added "-f" flag to force overwrite of output file. +>5. Added "-c" flag and "zcat" program. 'ln compress zcat' after you +> compile. +>6. The 'uncompress' script has been deleted; simply +> 'ln compress uncompress' after you compile and it will work. +>7. Removed extra bit masking for machines that support unsigned +> characters. If your machine doesn't support unsigned characters, +> define "NO_UCHAR" when compiling. +> +>Compile "compress.c" with "-O -o compress" flags. Move "compress" to a +>standard executable location, such as /usr/local. Then: +> cd /usr/local +> ln compress uncompress +> ln compress zcat +> +>On machines that have a fixed stack size (such as Perkin-Elmer), set the +>stack to at least 12kb. ("setstack compress 12" on Perkin-Elmer). +> +>Next, install the manual (compress.l). +> cp compress.l /usr/man/manl - or - +> cp compress.l /usr/man/man1/compress.1 +> +>Here is the README that I sent with my first posting: +> +>>Enclosed is a modified version of compress.c, along with scripts to make it +>>run identically to pack(1), unpack(1), an pcat(1). Here is what I +>>(petsd!joe) and a colleague (petsd!peora!srd) did: +>> +>>1. Removed VAX dependencies. +>>2. Changed the struct to separate arrays; saves mucho memory. +>>3. Did comparisons in unsigned, where possible. (Faster on Perkin-Elmer.) +>>4. Sorted the character next chain and changed the search to stop +>>prematurely. This saves a lot on the execution time when compressing. +>> +>>This version is totally compatible with the original version. Even though +>>lint(1) -p has no complaints about compress.c, it won't run on a 16-bit +>>machine, due to the size of the arrays. +>> +>>Here is the README file from the original author: +>> +>>>Well, with all this discussion about file compression (for news batching +>>>in particular) going around, I decided to implement the text compression +>>>algorithm described in the June Computer magazine. The author claimed +>>>blinding speed and good compression ratios. It's certainly faster than +>>>compact (but, then, what wouldn't be), but it's also the same speed as +>>>pack, and gets better compression than both of them. On 350K bytes of +>>>unix-wizards, compact took about 8 minutes of CPU, pack took about 80 +>>>seconds, and compress (herein) also took 80 seconds. But, compact and +>>>pack got about 30% compression, whereas compress got over 50%. So, I +>>>decided I had something, and that others might be interested, too. +>>> +>>>As is probably true of compact and pack (although I haven't checked), +>>>the byte order within a word is probably relevant here, but as long as +>>>you stay on a single machine type, you should be ok. (Can anybody +>>>elucidate on this?) There are a couple of asm's in the code (extv and +>>>insv instructions), so anyone porting it to another machine will have to +>>>deal with this anyway (and could probably make it compatible with Vax +>>>byte order at the same time). Anyway, I've linted the code (both with +>>>and without -p), so it should run elsewhere. Note the longs in the +>>>code, you can take these out if you reduce BITS to <= 15. +>>> +>>>Have fun, and as always, if you make good enhancements, or bug fixes, +>>>I'd like to see them. +>>> +>>>=Spencer (thomas@utah-20, {harpo,hplabs,arizona}!utah-cs!thomas) +>> +>> regards, +>> joe +>> +>>-- +>>Full-Name: Joseph M. Orost +>>UUCP: ..!{decvax,ucbvax,ihnp4}!vax135!petsd!joe +>>US Mail: MS 313; Perkin-Elmer; 106 Apple St; Tinton Falls, NJ 07724 +>>Phone: (201) 870-5844 diff --git a/usr.bin/compress/doc/revision.log b/usr.bin/compress/doc/revision.log new file mode 100644 index 000000000000..b1d8b24cc4f0 --- /dev/null +++ b/usr.bin/compress/doc/revision.log @@ -0,0 +1,116 @@ +/* + * $Header: compress.c,v 4.0 85/07/30 12:50:00 joe Release $ + * $Log: compress.c,v $ + * Revision 4.0 85/07/30 12:50:00 joe + * Removed ferror() calls in output routine on every output except first. + * Prepared for release to the world. + * + * Revision 3.6 85/07/04 01:22:21 joe + * Remove much wasted storage by overlaying hash table with the tables + * used by decompress: tab_suffix[1<<BITS], stack[8000]. Updated USERMEM + * computations. Fixed dump_tab() DEBUG routine. + * + * Revision 3.5 85/06/30 20:47:21 jaw + * Change hash function to use exclusive-or. Rip out hash cache. These + * speedups render the megamemory version defunct, for now. Make decoder + * stack global. Parts of the RCS trunks 2.7, 2.6, and 2.1 no longer apply. + * + * Revision 3.4 85/06/27 12:00:00 ken + * Get rid of all floating-point calculations by doing all compression ratio + * calculations in fixed point. + * + * Revision 3.3 85/06/24 21:53:24 joe + * Incorporate portability suggestion for M_XENIX. Got rid of text on #else + * and #endif lines. Cleaned up #ifdefs for vax and interdata. + * + * Revision 3.2 85/06/06 21:53:24 jaw + * Incorporate portability suggestions for Z8000, IBM PC/XT from mailing list. + * Default to "quiet" output (no compression statistics). + * + * Revision 3.1 85/05/12 18:56:13 jaw + * Integrate decompress() stack speedups (from early pointer mods by McKie). + * Repair multi-file USERMEM gaffe. Unify 'force' flags to mimic semantics + * of SVR2 'pack'. Streamline block-compress table clear logic. Increase + * output byte count by magic number size. + * + * Revision 3.0 84/11/27 11:50:00 petsd!joe + * Set HSIZE depending on BITS. Set BITS depending on USERMEM. Unrolled + * loops in clear routines. Added "-C" flag for 2.0 compatibility. Used + * unsigned compares on Perkin-Elmer. Fixed foreground check. + * + * Revision 2.7 84/11/16 19:35:39 ames!jaw + * Cache common hash codes based on input statistics; this improves + * performance for low-density raster images. Pass on #ifdef bundle + * from Turkowski. + * + * Revision 2.6 84/11/05 19:18:21 ames!jaw + * Vary size of hash tables to reduce time for small files. + * Tune PDP-11 hash function. + * + * Revision 2.5 84/10/30 20:15:14 ames!jaw + * Junk chaining; replace with the simpler (and, on the VAX, faster) + * double hashing, discussed within. Make block compression standard. + * + * Revision 2.4 84/10/16 11:11:11 ames!jaw + * Introduce adaptive reset for block compression, to boost the rate + * another several percent. (See mailing list notes.) + * + * Revision 2.3 84/09/22 22:00:00 petsd!joe + * Implemented "-B" block compress. Implemented REVERSE sorting of tab_next. + * Bug fix for last bits. Changed fwrite to putchar loop everywhere. + * + * Revision 2.2 84/09/18 14:12:21 ames!jaw + * Fold in news changes, small machine typedef from thomas, + * #ifdef interdata from joe. + * + * Revision 2.1 84/09/10 12:34:56 ames!jaw + * Configured fast table lookup for 32-bit machines. + * This cuts user time in half for b <= FBITS, and is useful for news batching + * from VAX to PDP sites. Also sped up decompress() [fwrite->putc] and + * added signal catcher [plus beef in writeerr()] to delete effluvia. + * + * Revision 2.0 84/08/28 22:00:00 petsd!joe + * Add check for foreground before prompting user. Insert maxbits into + * compressed file. Force file being uncompressed to end with ".Z". + * Added "-c" flag and "zcat". Prepared for release. + * + * Revision 1.10 84/08/24 18:28:00 turtlevax!ken + * Will only compress regular files (no directories), added a magic number + * header (plus an undocumented -n flag to handle old files without headers), + * added -f flag to force overwriting of possibly existing destination file, + * otherwise the user is prompted for a response. Will tack on a .Z to a + * filename if it doesn't have one when decompressing. Will only replace + * file if it was compressed. + * + * Revision 1.9 84/08/16 17:28:00 turtlevax!ken + * Removed scanargs(), getopt(), added .Z extension and unlimited number of + * filenames to compress. Flags may be clustered (-Ddvb12) or separated + * (-D -d -v -b 12), or combination thereof. Modes and other status is + * copied with copystat(). -O bug for 4.2 seems to have disappeared with + * 1.8. + * + * Revision 1.8 84/08/09 23:15:00 joe + * Made it compatible with vax version, installed jim's fixes/enhancements + * + * Revision 1.6 84/08/01 22:08:00 joe + * Sped up algorithm significantly by sorting the compress chain. + * + * Revision 1.5 84/07/13 13:11:00 srd + * Added C version of vax asm routines. Changed structure to arrays to + * save much memory. Do unsigned compares where possible (faster on + * Perkin-Elmer) + * + * Revision 1.4 84/07/05 03:11:11 thomas + * Clean up the code a little and lint it. (Lint complains about all + * the regs used in the asm, but I'm not going to "fix" this.) + * + * Revision 1.3 84/07/05 02:06:54 thomas + * Minor fixes. + * + * Revision 1.2 84/07/05 00:27:27 thomas + * Add variable bit length output. + * + */ + +static char rcs_ident[] = + "$Header: compress.c,v 4.0 85/07/30 12:50:00 joe Release $"; diff --git a/usr.bin/compress/zcat.sh b/usr.bin/compress/zcat.sh new file mode 100644 index 000000000000..c4931e486bb0 --- /dev/null +++ b/usr.bin/compress/zcat.sh @@ -0,0 +1,37 @@ +#!/bin/sh - +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# @(#)zcat.sh 8.1 (Berkeley) 6/6/93 +# + +uncompress -c $* diff --git a/usr.bin/compress/zopen.3 b/usr.bin/compress/zopen.3 new file mode 100644 index 000000000000..853462f5057a --- /dev/null +++ b/usr.bin/compress/zopen.3 @@ -0,0 +1,139 @@ +.\" Copyright (c) 1992, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)zopen.3 8.1 (Berkeley) 6/9/93 +.\" +.Dd June 9, 1993 +.Dt ZOPEN 3 +.Os +.Sh NAME +.Nm zopen +.Nd compressed stream open function +.Sh SYNOPSIS +.Fd #include <stdio.h> +.Ft FILE * +.Fn zopen "const char *path" "const char *mode" "int bits" +.Sh DESCRIPTION +The +.Fn zopen +function +opens the compressed file whose name is the string pointed to by +.Fa path +and associates a stream with it. +.Pp +The argument +.Fa mode +points to one of the following one-character strings: +.Bl -tag -width indent +.It Dq Li r +Open compressed file for reading. +The stream is positioned at the beginning of the file. +.It Dq Li w +Truncate file to zero length or create compressed file for writing. +The stream is positioned at the beginning of the file. +.El +.Pp +Any created files will have mode +.Pf \\*q Dv S_IRUSR +\&| +.Dv S_IWUSR +\&| +.Dv S_IRGRP +\&| +.Dv S_IWGRP +\&| +.Dv S_IROTH +\&| +.Dv S_IWOTH Ns \\*q +.Pq Li 0666 , +as modified by the process' +umask value (see +.Xr umask 2 ) . +.Pp +Files may only be read or written. +Seek operations are not allowed. +.Pp +The +.Fa bits +argument, if non-zero, is set to the bits code limit. +If zero, the default is 16. +See +.Fn compress 1 +for more information. +.Sh RETURN VALUES +Upon successful completion +.Fn zopen +returns a +.Tn FILE +pointer. +Otherwise, +.Dv NULL +is returned and the global variable +.Va errno +is set to indicate the error. +.Sh ERRORS +.Bl -tag -width [EINVAL] +.It Bq Er EINVAL +The +.Fa mode +or +.Fa bits +arguments specified to +.Fn zopen +were invalid. +.It Bq Er EFTYPE +The compressed file starts with an invalid header, or the compressed +file is compressed with more bits than can be handled. +.El +.Pp +The +.Fn zopen +function may also fail and set +.Va errno +for any of the errors specified for the routines +.Xr fopen 3 +or +.Xr funopen 3 . +.Sh SEE ALSO +.Xr compress 1 , +.Xr fopen 3 , +.Xr funopen 3 +.Sh HISTORY +The +.Nm zopen +function +first appeared in 4.4BSD. +.Sh BUGS +The +.Fn zopen +function +may not be portable to systems other than +.Bx . diff --git a/usr.bin/compress/zopen.c b/usr.bin/compress/zopen.c new file mode 100644 index 000000000000..b76fca6edada --- /dev/null +++ b/usr.bin/compress/zopen.c @@ -0,0 +1,740 @@ +/*- + * Copyright (c) 1985, 1986, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Diomidis Spinellis and James A. Woods, derived from original + * work by Spencer Thomas and Joseph Orost. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)zopen.c 8.1 (Berkeley) 6/27/93"; +#endif /* LIBC_SCCS and not lint */ + +/*- + * fcompress.c - File compression ala IEEE Computer, June 1984. + * + * Compress authors: + * Spencer W. Thomas (decvax!utah-cs!thomas) + * Jim McKie (decvax!mcvax!jim) + * Steve Davies (decvax!vax135!petsd!peora!srd) + * Ken Turkowski (decvax!decwrl!turtlevax!ken) + * James A. Woods (decvax!ihnp4!ames!jaw) + * Joe Orost (decvax!vax135!petsd!joe) + * + * Cleaned up and converted to library returning I/O streams by + * Diomidis Spinellis <dds@doc.ic.ac.uk>. + * + * zopen(filename, mode, bits) + * Returns a FILE * that can be used for read or write. The modes + * supported are only "r" and "w". Seeking is not allowed. On + * reading the file is decompressed, on writing it is compressed. + * The output is compatible with compress(1) with 16 bit tables. + * Any file produced by compress(1) can be read. + */ + +#include <sys/param.h> +#include <sys/stat.h> + +#include <ctype.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#define BITS 16 /* Default bits. */ +#define HSIZE 69001 /* 95% occupancy */ + +/* A code_int must be able to hold 2**BITS values of type int, and also -1. */ +typedef long code_int; +typedef long count_int; + +typedef u_char char_type; +static char_type magic_header[] = + {'\037', '\235'}; /* 1F 9D */ + +#define BIT_MASK 0x1f /* Defines for third byte of header. */ +#define BLOCK_MASK 0x80 + +/* + * Masks 0x40 and 0x20 are free. I think 0x20 should mean that there is + * a fourth header byte (for expansion). + */ +#define INIT_BITS 9 /* Initial number of bits/code. */ + +#define MAXCODE(n_bits) ((1 << (n_bits)) - 1) + +struct s_zstate { + FILE *zs_fp; /* File stream for I/O */ + char zs_mode; /* r or w */ + enum { + S_START, S_MIDDLE, S_EOF + } zs_state; /* State of computation */ + int zs_n_bits; /* Number of bits/code. */ + int zs_maxbits; /* User settable max # bits/code. */ + code_int zs_maxcode; /* Maximum code, given n_bits. */ + code_int zs_maxmaxcode; /* Should NEVER generate this code. */ + count_int zs_htab [HSIZE]; + u_short zs_codetab [HSIZE]; + code_int zs_hsize; /* For dynamic table sizing. */ + code_int zs_free_ent; /* First unused entry. */ + /* + * Block compression parameters -- after all codes are used up, + * and compression rate changes, start over. + */ + int zs_block_compress; + int zs_clear_flg; + long zs_ratio; + count_int zs_checkpoint; + int zs_offset; + long zs_in_count; /* Length of input. */ + long zs_bytes_out; /* Length of compressed output. */ + long zs_out_count; /* # of codes output (for debugging). */ + char_type zs_buf[BITS]; + union { + struct { + long zs_fcode; + code_int zs_ent; + code_int zs_hsize_reg; + int zs_hshift; + } w; /* Write paramenters */ + struct { + char_type *zs_stackp; + int zs_finchar; + code_int zs_code, zs_oldcode, zs_incode; + int zs_roffset, zs_size; + char_type zs_gbuf[BITS]; + } r; /* Read parameters */ + } u; +}; + +/* Definitions to retain old variable names */ +#define fp zs->zs_fp +#define zmode zs->zs_mode +#define state zs->zs_state +#define n_bits zs->zs_n_bits +#define maxbits zs->zs_maxbits +#define maxcode zs->zs_maxcode +#define maxmaxcode zs->zs_maxmaxcode +#define htab zs->zs_htab +#define codetab zs->zs_codetab +#define hsize zs->zs_hsize +#define free_ent zs->zs_free_ent +#define block_compress zs->zs_block_compress +#define clear_flg zs->zs_clear_flg +#define ratio zs->zs_ratio +#define checkpoint zs->zs_checkpoint +#define offset zs->zs_offset +#define in_count zs->zs_in_count +#define bytes_out zs->zs_bytes_out +#define out_count zs->zs_out_count +#define buf zs->zs_buf +#define fcode zs->u.w.zs_fcode +#define hsize_reg zs->u.w.zs_hsize_reg +#define ent zs->u.w.zs_ent +#define hshift zs->u.w.zs_hshift +#define stackp zs->u.r.zs_stackp +#define finchar zs->u.r.zs_finchar +#define code zs->u.r.zs_code +#define oldcode zs->u.r.zs_oldcode +#define incode zs->u.r.zs_incode +#define roffset zs->u.r.zs_roffset +#define size zs->u.r.zs_size +#define gbuf zs->u.r.zs_gbuf + +/* + * To save much memory, we overlay the table used by compress() with those + * used by decompress(). The tab_prefix table is the same size and type as + * the codetab. The tab_suffix table needs 2**BITS characters. We get this + * from the beginning of htab. The output stack uses the rest of htab, and + * contains characters. There is plenty of room for any possible stack + * (stack used to be 8000 characters). + */ + +#define htabof(i) htab[i] +#define codetabof(i) codetab[i] + +#define tab_prefixof(i) codetabof(i) +#define tab_suffixof(i) ((char_type *)(htab))[i] +#define de_stack ((char_type *)&tab_suffixof(1 << BITS)) + +#define CHECK_GAP 10000 /* Ratio check interval. */ + +/* + * the next two codes should not be changed lightly, as they must not + * lie within the contiguous general code space. + */ +#define FIRST 257 /* First free entry. */ +#define CLEAR 256 /* Table clear output code. */ + +static int cl_block __P((struct s_zstate *)); +static void cl_hash __P((struct s_zstate *, count_int)); +static code_int getcode __P((struct s_zstate *)); +static int output __P((struct s_zstate *, code_int)); +static int zclose __P((void *)); +static int zread __P((void *, char *, int)); +static int zwrite __P((void *, const char *, int)); + +/*- + * Algorithm from "A Technique for High Performance Data Compression", + * Terry A. Welch, IEEE Computer Vol 17, No 6 (June 1984), pp 8-19. + * + * Algorithm: + * Modified Lempel-Ziv method (LZW). Basically finds common + * substrings and replaces them with a variable size code. This is + * deterministic, and can be done on the fly. Thus, the decompression + * procedure needs no input table, but tracks the way the table was built. + */ + +/*- + * compress write + * + * Algorithm: use open addressing double hashing (no chaining) on the + * prefix code / next character combination. We do a variant of Knuth's + * algorithm D (vol. 3, sec. 6.4) along with G. Knott's relatively-prime + * secondary probe. Here, the modular division first probe is gives way + * to a faster exclusive-or manipulation. Also do block compression with + * an adaptive reset, whereby the code table is cleared when the compression + * ratio decreases, but after the table fills. The variable-length output + * codes are re-sized at this point, and a special CLEAR code is generated + * for the decompressor. Late addition: construct the table according to + * file size for noticeable speed improvement on small files. Please direct + * questions about this implementation to ames!jaw. + */ +static int +zwrite(cookie, wbp, num) + void *cookie; + const char *wbp; + int num; +{ + register code_int i; + register int c, disp; + struct s_zstate *zs; + const u_char *bp; + u_char tmp; + int count; + + if (num == 0) + return (0); + + zs = cookie; + count = num; + bp = (u_char *)wbp; + if (state == S_MIDDLE) + goto middle; + state = S_MIDDLE; + + maxmaxcode = 1L << BITS; + if (fwrite(magic_header, + sizeof(char), sizeof(magic_header), fp) != sizeof(magic_header)) + return (-1); + tmp = (u_char)(BITS | block_compress); + if (fwrite(&tmp, sizeof(char), sizeof(tmp), fp) != sizeof(tmp)) + return (-1); + + offset = 0; + bytes_out = 3; /* Includes 3-byte header mojo. */ + out_count = 0; + clear_flg = 0; + ratio = 0; + in_count = 1; + checkpoint = CHECK_GAP; + maxcode = MAXCODE(n_bits = INIT_BITS); + free_ent = ((block_compress) ? FIRST : 256); + + ent = *bp++; + --count; + + hshift = 0; + for (fcode = (long)hsize; fcode < 65536L; fcode *= 2L) + hshift++; + hshift = 8 - hshift; /* Set hash code range bound. */ + + hsize_reg = hsize; + cl_hash(zs, (count_int)hsize_reg); /* Clear hash table. */ + +middle: for (i = 0; count--;) { + c = *bp++; + in_count++; + fcode = (long)(((long)c << maxbits) + ent); + i = ((c << hshift) ^ ent); /* Xor hashing. */ + + if (htabof(i) == fcode) { + ent = codetabof(i); + continue; + } else if ((long)htabof(i) < 0) /* Empty slot. */ + goto nomatch; + disp = hsize_reg - i; /* Secondary hash (after G. Knott). */ + if (i == 0) + disp = 1; +probe: if ((i -= disp) < 0) + i += hsize_reg; + + if (htabof(i) == fcode) { + ent = codetabof(i); + continue; + } + if ((long)htabof(i) >= 0) + goto probe; +nomatch: if (output(zs, (code_int) ent) == -1) + return (-1); + out_count++; + ent = c; + if (free_ent < maxmaxcode) { + codetabof(i) = free_ent++; /* code -> hashtable */ + htabof(i) = fcode; + } else if ((count_int)in_count >= + checkpoint && block_compress) { + if (cl_block(zs) == -1) + return (-1); + } + } + return (num); +} + +static int +zclose(cookie) + void *cookie; +{ + struct s_zstate *zs; + int rval; + + zs = cookie; + if (zmode == 'w') { /* Put out the final code. */ + if (output(zs, (code_int) ent) == -1) { + (void)fclose(fp); + free(zs); + return (-1); + } + out_count++; + if (output(zs, (code_int) - 1) == -1) { + (void)fclose(fp); + free(zs); + return (-1); + } + } + rval = fclose(fp) == EOF ? -1 : 0; + free(zs); + return (rval); +} + +/*- + * Output the given code. + * Inputs: + * code: A n_bits-bit integer. If == -1, then EOF. This assumes + * that n_bits =< (long)wordsize - 1. + * Outputs: + * Outputs code to the file. + * Assumptions: + * Chars are 8 bits long. + * Algorithm: + * Maintain a BITS character long buffer (so that 8 codes will + * fit in it exactly). Use the VAX insv instruction to insert each + * code in turn. When the buffer fills up empty it and start over. + */ + +static char_type lmask[9] = + {0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00}; +static char_type rmask[9] = + {0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff}; + +static int +output(zs, ocode) + struct s_zstate *zs; + code_int ocode; +{ + register int bits, r_off; + register char_type *bp; + + r_off = offset; + bits = n_bits; + bp = buf; + if (ocode >= 0) { + /* Get to the first byte. */ + bp += (r_off >> 3); + r_off &= 7; + /* + * Since ocode is always >= 8 bits, only need to mask the first + * hunk on the left. + */ + *bp = (*bp & rmask[r_off]) | (ocode << r_off) & lmask[r_off]; + bp++; + bits -= (8 - r_off); + ocode >>= 8 - r_off; + /* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */ + if (bits >= 8) { + *bp++ = ocode; + ocode >>= 8; + bits -= 8; + } + /* Last bits. */ + if (bits) + *bp = ocode; + offset += n_bits; + if (offset == (n_bits << 3)) { + bp = buf; + bits = n_bits; + bytes_out += bits; + if (fwrite(bp, sizeof(char), bits, fp) != bits) + return (-1); + bp += bits; + bits = 0; + offset = 0; + } + /* + * If the next entry is going to be too big for the ocode size, + * then increase it, if possible. + */ + if (free_ent > maxcode || (clear_flg > 0)) { + /* + * Write the whole buffer, because the input side won't + * discover the size increase until after it has read it. + */ + if (offset > 0) { + if (fwrite(buf, 1, n_bits, fp) != n_bits) + return (-1); + bytes_out += n_bits; + } + offset = 0; + + if (clear_flg) { + maxcode = MAXCODE(n_bits = INIT_BITS); + clear_flg = 0; + } else { + n_bits++; + if (n_bits == maxbits) + maxcode = maxmaxcode; + else + maxcode = MAXCODE(n_bits); + } + } + } else { + /* At EOF, write the rest of the buffer. */ + if (offset > 0) { + offset = (offset + 7) / 8; + if (fwrite(buf, 1, offset, fp) != offset) + return (-1); + bytes_out += offset; + } + offset = 0; + } + return (0); +} + +/* + * Decompress read. This routine adapts to the codes in the file building + * the "string" table on-the-fly; requiring no table to be stored in the + * compressed file. The tables used herein are shared with those of the + * compress() routine. See the definitions above. + */ +static int +zread(cookie, rbp, num) + void *cookie; + char *rbp; + int num; +{ + register u_int count; + struct s_zstate *zs; + u_char *bp, header[3]; + + if (num == 0) + return (0); + + zs = cookie; + count = num; + bp = (u_char *)rbp; + switch (state) { + case S_START: + state = S_MIDDLE; + break; + case S_MIDDLE: + goto middle; + case S_EOF: + goto eof; + } + + /* Check the magic number */ + if (fread(header, + sizeof(char), sizeof(header), fp) != sizeof(header) || + memcmp(header, magic_header, sizeof(magic_header)) != 0) { + errno = EFTYPE; + return (-1); + } + maxbits = header[2]; /* Set -b from file. */ + block_compress = maxbits & BLOCK_MASK; + maxbits &= BIT_MASK; + maxmaxcode = 1L << maxbits; + if (maxbits > BITS) { + errno = EFTYPE; + return (-1); + } + /* As above, initialize the first 256 entries in the table. */ + maxcode = MAXCODE(n_bits = INIT_BITS); + for (code = 255; code >= 0; code--) { + tab_prefixof(code) = 0; + tab_suffixof(code) = (char_type) code; + } + free_ent = block_compress ? FIRST : 256; + + finchar = oldcode = getcode(zs); + if (oldcode == -1) /* EOF already? */ + return (0); /* Get out of here */ + + /* First code must be 8 bits = char. */ + *bp++ = (u_char)finchar; + count--; + stackp = de_stack; + + while ((code = getcode(zs)) > -1) { + + if ((code == CLEAR) && block_compress) { + for (code = 255; code >= 0; code--) + tab_prefixof(code) = 0; + clear_flg = 1; + free_ent = FIRST - 1; + if ((code = getcode(zs)) == -1) /* O, untimely death! */ + break; + } + incode = code; + + /* Special case for KwKwK string. */ + if (code >= free_ent) { + *stackp++ = finchar; + code = oldcode; + } + + /* Generate output characters in reverse order. */ + while (code >= 256) { + *stackp++ = tab_suffixof(code); + code = tab_prefixof(code); + } + *stackp++ = finchar = tab_suffixof(code); + + /* And put them out in forward order. */ +middle: do { + if (count-- == 0) + return (num); + *bp++ = *--stackp; + } while (stackp > de_stack); + + /* Generate the new entry. */ + if ((code = free_ent) < maxmaxcode) { + tab_prefixof(code) = (u_short) oldcode; + tab_suffixof(code) = finchar; + free_ent = code + 1; + } + + /* Remember previous code. */ + oldcode = incode; + } + state = S_EOF; +eof: return (num - count); +} + +/*- + * Read one code from the standard input. If EOF, return -1. + * Inputs: + * stdin + * Outputs: + * code or -1 is returned. + */ +static code_int +getcode(zs) + struct s_zstate *zs; +{ + register code_int gcode; + register int r_off, bits; + register char_type *bp; + + bp = gbuf; + if (clear_flg > 0 || roffset >= size || free_ent > maxcode) { + /* + * If the next entry will be too big for the current gcode + * size, then we must increase the size. This implies reading + * a new buffer full, too. + */ + if (free_ent > maxcode) { + n_bits++; + if (n_bits == maxbits) /* Won't get any bigger now. */ + maxcode = maxmaxcode; + else + maxcode = MAXCODE(n_bits); + } + if (clear_flg > 0) { + maxcode = MAXCODE(n_bits = INIT_BITS); + clear_flg = 0; + } + size = fread(gbuf, 1, n_bits, fp); + if (size <= 0) /* End of file. */ + return (-1); + roffset = 0; + /* Round size down to integral number of codes. */ + size = (size << 3) - (n_bits - 1); + } + r_off = roffset; + bits = n_bits; + + /* Get to the first byte. */ + bp += (r_off >> 3); + r_off &= 7; + + /* Get first part (low order bits). */ + gcode = (*bp++ >> r_off); + bits -= (8 - r_off); + r_off = 8 - r_off; /* Now, roffset into gcode word. */ + + /* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */ + if (bits >= 8) { + gcode |= *bp++ << r_off; + r_off += 8; + bits -= 8; + } + + /* High order bits. */ + gcode |= (*bp & rmask[bits]) << r_off; + roffset += n_bits; + + return (gcode); +} + +static int +cl_block(zs) /* Table clear for block compress. */ + struct s_zstate *zs; +{ + register long rat; + + checkpoint = in_count + CHECK_GAP; + + if (in_count > 0x007fffff) { /* Shift will overflow. */ + rat = bytes_out >> 8; + if (rat == 0) /* Don't divide by zero. */ + rat = 0x7fffffff; + else + rat = in_count / rat; + } else + rat = (in_count << 8) / bytes_out; /* 8 fractional bits. */ + if (rat > ratio) + ratio = rat; + else { + ratio = 0; + cl_hash(zs, (count_int) hsize); + free_ent = FIRST; + clear_flg = 1; + if (output(zs, (code_int) CLEAR) == -1) + return (-1); + } + return (0); +} + +static void +cl_hash(zs, cl_hsize) /* Reset code table. */ + struct s_zstate *zs; + register count_int cl_hsize; +{ + register count_int *htab_p; + register long i, m1; + + m1 = -1; + htab_p = htab + cl_hsize; + i = cl_hsize - 16; + do { /* Might use Sys V memset(3) here. */ + *(htab_p - 16) = m1; + *(htab_p - 15) = m1; + *(htab_p - 14) = m1; + *(htab_p - 13) = m1; + *(htab_p - 12) = m1; + *(htab_p - 11) = m1; + *(htab_p - 10) = m1; + *(htab_p - 9) = m1; + *(htab_p - 8) = m1; + *(htab_p - 7) = m1; + *(htab_p - 6) = m1; + *(htab_p - 5) = m1; + *(htab_p - 4) = m1; + *(htab_p - 3) = m1; + *(htab_p - 2) = m1; + *(htab_p - 1) = m1; + htab_p -= 16; + } while ((i -= 16) >= 0); + for (i += 16; i > 0; i--) + *--htab_p = m1; +} + +FILE * +zopen(fname, mode, bits) + const char *fname, *mode; + int bits; +{ + struct s_zstate *zs; + + if (mode[0] != 'r' && mode[0] != 'w' || mode[1] != '\0' || + bits < 0 || bits > BITS) { + errno = EINVAL; + return (NULL); + } + + if ((zs = calloc(1, sizeof(struct s_zstate))) == NULL) + return (NULL); + + maxbits = bits ? bits : BITS; /* User settable max # bits/code. */ + maxmaxcode = 1 << BITS; /* Should NEVER generate this code. */ + hsize = HSIZE; /* For dynamic table sizing. */ + free_ent = 0; /* First unused entry. */ + block_compress = BLOCK_MASK; + clear_flg = 0; + ratio = 0; + checkpoint = CHECK_GAP; + in_count = 1; /* Length of input. */ + out_count = 0; /* # of codes output (for debugging). */ + state = S_START; + roffset = 0; + size = 0; + + /* + * Layering compress on top of stdio in order to provide buffering, + * and ensure that reads and write work with the data specified. + */ + if ((fp = fopen(fname, mode)) == NULL) { + free(zs); + return (NULL); + } + switch (*mode) { + case 'r': + zmode = 'r'; + return (funopen(zs, zread, NULL, NULL, zclose)); + case 'w': + zmode = 'w'; + return (funopen(zs, NULL, zwrite, NULL, zclose)); + } + /* NOTREACHED */ +} |