diff options
Diffstat (limited to 'usr.bin/split')
-rw-r--r-- | usr.bin/split/Makefile | 10 | ||||
-rw-r--r-- | usr.bin/split/Makefile.depend | 16 | ||||
-rw-r--r-- | usr.bin/split/split.1 | 232 | ||||
-rw-r--r-- | usr.bin/split/split.c | 451 | ||||
-rw-r--r-- | usr.bin/split/tests/Makefile | 5 | ||||
-rw-r--r-- | usr.bin/split/tests/Makefile.depend | 10 | ||||
-rwxr-xr-x | usr.bin/split/tests/split_test.sh | 274 |
7 files changed, 998 insertions, 0 deletions
diff --git a/usr.bin/split/Makefile b/usr.bin/split/Makefile new file mode 100644 index 000000000000..99021342ec32 --- /dev/null +++ b/usr.bin/split/Makefile @@ -0,0 +1,10 @@ +.include <src.opts.mk> + +PROG= split + +LIBADD= util + +HAS_TESTS= yes +SUBDIR.${MK_TESTS}+= tests + +.include <bsd.prog.mk> diff --git a/usr.bin/split/Makefile.depend b/usr.bin/split/Makefile.depend new file mode 100644 index 000000000000..678747db6f2c --- /dev/null +++ b/usr.bin/split/Makefile.depend @@ -0,0 +1,16 @@ +# Autogenerated - do NOT edit! + +DIRDEPS = \ + include \ + include/xlocale \ + lib/${CSU_DIR} \ + lib/libc \ + lib/libcompiler_rt \ + lib/libutil \ + + +.include <dirdeps.mk> + +.if ${DEP_RELDIR} == ${_DEP_RELDIR} +# local dependencies - needed for -jN in clean tree +.endif diff --git a/usr.bin/split/split.1 b/usr.bin/split/split.1 new file mode 100644 index 000000000000..bd837f3e9c71 --- /dev/null +++ b/usr.bin/split/split.1 @@ -0,0 +1,232 @@ +.\" Copyright (c) 1990, 1991, 1993, 1994 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. 
+.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.Dd May 26, 2023 +.Dt SPLIT 1 +.Os +.Sh NAME +.Nm split +.Nd split a file into pieces +.Sh SYNOPSIS +.Nm +.Op Fl cd +.Op Fl l Ar line_count +.Op Fl a Ar suffix_length +.Op Ar file Op Ar prefix +.Nm +.Op Fl cd +.Fl b Ar byte_count Ns +.Oo +.Sm off +.Cm K | k | M | m | G | g +.Sm on +.Oc +.Op Fl a Ar suffix_length +.Op Ar file Op Ar prefix +.Nm +.Op Fl cd +.Fl n Ar chunk_count +.Op Fl a Ar suffix_length +.Op Ar file Op Ar prefix +.Nm +.Op Fl cd +.Fl p Ar pattern +.Op Fl a Ar suffix_length +.Op Ar file Op Ar prefix +.Sh DESCRIPTION +The +.Nm +utility reads the given +.Ar file +and breaks it up into files of 1000 lines each +(if no options are specified), leaving the +.Ar file +unchanged. +If +.Ar file +is a single dash +.Pq Sq Fl +or absent, +.Nm +reads from the standard input. +.Pp +The options are as follows: +.Bl -tag -width indent +.It Fl a Ar suffix_length +Use +.Ar suffix_length +letters to form the suffix of the file name. +.It Fl b Ar byte_count Ns Oo +.Sm off +.Cm K | k | M | m | G | g +.Sm on +.Oc +Create split files +.Ar byte_count +bytes in length. +If +.Cm k +or +.Cm K +is appended to the number, the file is split into +.Ar byte_count +kilobyte pieces. 
+If +.Cm m +or +.Cm M +is appended to the number, the file is split into +.Ar byte_count +megabyte pieces. +If +.Cm g +or +.Cm G +is appended to the number, the file is split into +.Ar byte_count +gigabyte pieces. +.It Fl c +Continue creating files and do not overwrite existing +output files. +.It Fl d +Use a numeric suffix instead of an alphabetic suffix. +.It Fl l Ar line_count +Create split files +.Ar line_count +lines in length. +.It Fl n Ar chunk_count +Split file into +.Ar chunk_count +smaller files. +The first n - 1 files will be of size (size of +.Ar file +/ +.Ar chunk_count +) +and the last file will contain the remaining bytes. +.It Fl p Ar pattern +The file is split whenever an input line matches +.Ar pattern , +which is interpreted as an extended regular expression. +The matching line will be the first line of the next output file. +This option is incompatible with the +.Fl b +and +.Fl l +options. +.El +.Pp +If additional arguments are specified, the first is used as the name +of the input file which is to be split. +If a second additional argument is specified, it is used as a prefix +for the names of the files into which the file is split. +In this case, each file into which the file is split is named by the +prefix followed by a lexically ordered suffix using +.Ar suffix_length +characters in the range +.Dq Li a Ns - Ns Li z . +If +.Fl a +is not specified, two letters are used as the initial suffix. +If the output does not fit into the resulting number of files and the +.Fl d +flag is not specified, then the suffix length is automatically extended as +needed such that all output files continue to sort in lexical order. +.Pp +If the +.Ar prefix +argument is not specified, the file is split into lexically ordered +files named with the prefix +.Dq Li x +and with suffixes as above. +.Pp +By default, +.Nm +will overwrite any existing output files. +If the +.Fl c +flag is specified, +.Nm +will instead create files with names that do not already exist. 
+.Sh ENVIRONMENT +The +.Ev LANG , LC_ALL , LC_CTYPE +and +.Ev LC_COLLATE +environment variables affect the execution of +.Nm +as described in +.Xr environ 7 . +.Sh EXIT STATUS +.Ex -std +.Sh EXAMPLES +Split input into as many files as needed, so that each file contains at most 2 +lines: +.Bd -literal -offset indent +$ echo -e "first line\\nsecond line\\nthird line\\nfourth line" | split -l2 +.Ed +.Pp +Split input in chunks of 10 bytes using numeric suffixes for file names. +This generates two files of 10 bytes (x00 and x01) and a third file (x02) with the +remaining 2 bytes: +.Bd -literal -offset indent +$ echo -e "This is 22 bytes long" | split -d -b10 +.Ed +.Pp +Split input generating 6 files: +.Bd -literal -offset indent +$ echo -e "This is 22 bytes long" | split -n 6 +.Ed +.Pp +Split input creating a new file every time a line matches the regular expression +for a +.Dq t +followed by either +.Dq a +or +.Dq u +thus creating two files: +.Bd -literal -offset indent +$ echo -e "stack\\nstock\\nstuck\\nanother line" | split -p 't[au]' +.Ed +.Sh SEE ALSO +.Xr csplit 1 , +.Xr re_format 7 +.Sh STANDARDS +The +.Nm +utility conforms to +.St -p1003.1-2001 . +.Sh HISTORY +A +.Nm +command appeared in +.At v3 . +.Pp +Before +.Fx 14 , +pattern and line matching only operated on lines shorter than 65,536 bytes. diff --git a/usr.bin/split/split.c b/usr.bin/split/split.c new file mode 100644 index 000000000000..2724f8a20cde --- /dev/null +++ b/usr.bin/split/split.c @@ -0,0 +1,451 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1987, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/stat.h> + +#include <ctype.h> +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <inttypes.h> +#include <libutil.h> +#include <limits.h> +#include <locale.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <regex.h> +#include <sysexits.h> + +#define DEFLINE 1000 /* Default num lines per file. */ + +static off_t bytecnt; /* Byte count to split on. */ +static long chunks; /* Chunks count to split into. */ +static bool clobber = true; /* Whether to overwrite existing output files. */ +static long numlines; /* Line count to split on. */ +static int file_open; /* If a file open. */ +static int ifd = -1, ofd = -1; /* Input/output file descriptors. 
*/ +static char fname[MAXPATHLEN]; /* File name prefix. */ +static regex_t rgx; +static int pflag; +static bool dflag; +static long sufflen = 2; /* File name suffix length. */ +static bool autosfx = true; /* Whether to auto-extend the suffix length. */ + +static void newfile(void); +static void split1(void); +static void split2(void); +static void split3(void); +static void usage(void) __dead2; + +int +main(int argc, char **argv) +{ + char errbuf[64]; + const char *p, *errstr; + int ch, error; + + setlocale(LC_ALL, ""); + + dflag = false; + while ((ch = getopt(argc, argv, "0::1::2::3::4::5::6::7::8::9::a:b:cdl:n:p:")) != -1) + switch (ch) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + /* + * Undocumented kludge: split was originally designed + * to take a number after a dash. + */ + if (numlines != 0) + usage(); + numlines = ch - '0'; + p = optarg ? optarg : ""; + while (numlines >= 0 && *p >= '0' && *p <= '9') + numlines = numlines * 10 + *p++ - '0'; + if (numlines <= 0 || *p != '\0') + errx(EX_USAGE, "%c%s: line count is invalid", + ch, optarg ? optarg : ""); + break; + case 'a': /* Suffix length */ + sufflen = strtonum(optarg, 0, INT_MAX, &errstr); + if (errstr != NULL) { + errx(EX_USAGE, "%s: suffix length is %s", + optarg, errstr); + } + if (sufflen == 0) { + sufflen = 2; + autosfx = true; + } else { + autosfx = false; + } + break; + case 'b': /* Byte count. */ + if (expand_number(optarg, &bytecnt) != 0) { + errx(EX_USAGE, "%s: byte count is invalid", + optarg); + } + break; + case 'c': /* Continue, don't overwrite output files. */ + clobber = false; + break; + case 'd': /* Decimal suffix */ + dflag = true; + break; + case 'l': /* Line count. */ + if (numlines != 0) + usage(); + numlines = strtonum(optarg, 1, LONG_MAX, &errstr); + if (errstr != NULL) { + errx(EX_USAGE, "%s: line count is %s", + optarg, errstr); + } + break; + case 'n': /* Chunks. 
*/ + chunks = strtonum(optarg, 1, LONG_MAX, &errstr); + if (errstr != NULL) { + errx(EX_USAGE, "%s: number of chunks is %s", + optarg, errstr); + } + break; + + case 'p': /* pattern matching. */ + error = regcomp(&rgx, optarg, REG_EXTENDED|REG_NOSUB); + if (error != 0) { + regerror(error, &rgx, errbuf, sizeof(errbuf)); + errx(EX_USAGE, "%s: regex is invalid: %s", + optarg, errbuf); + } + pflag = 1; + break; + default: + usage(); + } + argv += optind; + argc -= optind; + + if (argc > 0) { /* Input file. */ + if (strcmp(*argv, "-") == 0) + ifd = STDIN_FILENO; + else if ((ifd = open(*argv, O_RDONLY, 0)) < 0) + err(EX_NOINPUT, "%s", *argv); + ++argv; + --argc; + } + if (argc > 0) { /* File name prefix. */ + if (strlcpy(fname, *argv, sizeof(fname)) >= sizeof(fname)) { + errx(EX_USAGE, "%s: file name prefix is too long", + *argv); + } + ++argv; + --argc; + } + if (argc > 0) + usage(); + + if (strlen(fname) + (unsigned long)sufflen >= sizeof(fname)) + errx(EX_USAGE, "suffix is too long"); + if (pflag && (numlines != 0 || bytecnt != 0 || chunks != 0)) + usage(); + + if (numlines == 0) + numlines = DEFLINE; + else if (bytecnt != 0 || chunks != 0) + usage(); + + if (bytecnt != 0 && chunks != 0) + usage(); + + if (ifd == -1) /* Stdin by default. */ + ifd = 0; + + if (bytecnt != 0) { + split1(); + exit (0); + } else if (chunks != 0) { + split3(); + exit (0); + } + split2(); + if (pflag) + regfree(&rgx); + exit(0); +} + +/* + * split1 -- + * Split the input by bytes. 
+ */ +static void +split1(void) +{ + static char bfr[MAXBSIZE]; + off_t bcnt; + char *C; + ssize_t dist, len; + int nfiles; + + nfiles = 0; + + for (bcnt = 0;;) + switch ((len = read(ifd, bfr, sizeof(bfr)))) { + case 0: + exit(0); + case -1: + err(EX_IOERR, "read"); + /* NOTREACHED */ + default: + if (!file_open) { + if (chunks == 0 || nfiles < chunks) { + newfile(); + nfiles++; + } + } + if (bcnt + len >= bytecnt) { + dist = bytecnt - bcnt; + if (write(ofd, bfr, dist) != dist) + err(EX_IOERR, "write"); + len -= dist; + for (C = bfr + dist; len >= bytecnt; + len -= bytecnt, C += bytecnt) { + if (chunks == 0 || nfiles < chunks) { + newfile(); + nfiles++; + } + if (write(ofd, C, bytecnt) != bytecnt) + err(EX_IOERR, "write"); + } + if (len != 0) { + if (chunks == 0 || nfiles < chunks) { + newfile(); + nfiles++; + } + if (write(ofd, C, len) != len) + err(EX_IOERR, "write"); + } else { + file_open = 0; + } + bcnt = len; + } else { + bcnt += len; + if (write(ofd, bfr, len) != len) + err(EX_IOERR, "write"); + } + } +} + +/* + * split2 -- + * Split the input by lines. + */ +static void +split2(void) +{ + char *buf; + size_t bufsize; + ssize_t len; + long lcnt = 0; + FILE *infp; + + buf = NULL; + bufsize = 0; + + /* Stick a stream on top of input file descriptor */ + if ((infp = fdopen(ifd, "r")) == NULL) + err(EX_NOINPUT, "fdopen"); + + /* Process input one line at a time */ + while ((errno = 0, len = getline(&buf, &bufsize, infp)) > 0) { + /* Check if we need to start a new file */ + if (pflag) { + regmatch_t pmatch; + + pmatch.rm_so = 0; + pmatch.rm_eo = len - 1; + if (regexec(&rgx, buf, 0, &pmatch, REG_STARTEND) == 0) + newfile(); + } else if (lcnt++ == numlines) { + newfile(); + lcnt = 1; + } + + /* Open output file if needed */ + if (!file_open) + newfile(); + + /* Write out line */ + if (write(ofd, buf, len) != len) + err(EX_IOERR, "write"); + } + + /* EOF or error? 
*/ + if ((len == -1 && errno != 0) || ferror(infp)) + err(EX_IOERR, "read"); + else + exit(0); +} + +/* + * split3 -- + * Split the input into the specified number of chunks. The chunk + * size is the input size divided by chunks; split1() does the + * writing and appends any leftover bytes to the last file. + */ +static void +split3(void) +{ + struct stat sb; + + if (fstat(ifd, &sb) == -1) { + err(1, "stat"); + /* NOTREACHED */ + } + + if (chunks > sb.st_size) { + /* st_size is an off_t; print it at full width, not as an int. */ + errx(1, "can't split into more than %jd files", + (intmax_t)sb.st_size); + /* NOTREACHED */ + } + + bytecnt = sb.st_size / chunks; + split1(); +} + + +/* + * newfile -- + * Open a new output file. + */ +static void +newfile(void) +{ + long i, maxfiles, tfnum; + static long fnum; + static char *fpnt; + char beg, end; + int pattlen; + int flags = O_WRONLY | O_CREAT | O_TRUNC; + + if (!clobber) + flags |= O_EXCL; + + if (ofd == -1) { + if (fname[0] == '\0') { + fname[0] = 'x'; + fpnt = fname + 1; + } else { + fpnt = fname + strlen(fname); + } + } else if (close(ofd) != 0) + err(1, "%s", fname); + + again: + if (dflag) { + beg = '0'; + end = '9'; + } + else { + beg = 'a'; + end = 'z'; + } + pattlen = end - beg + 1; + + /* + * If '-a' is not specified, then we automatically expand the + * suffix length to accommodate splitting all input. We do this + * by moving the suffix pointer (fpnt) forward and incrementing + * sufflen by one, thereby yielding an additional two characters + * and allowing all output files to sort such that 'cat *' yields + * the input in order. I.e., the order is '... xyy xyz xzaaa + * xzaab ... xzyzy, xzyzz, xzzaaaa, xzzaaab' and so on. + */ + if (!dflag && autosfx && (fpnt[0] == 'y') && + strspn(fpnt+1, "z") == strlen(fpnt+1)) { + /* Ensure the generated filenames will fit into the buffer. */ + if (strlen(fname) + 2 >= sizeof(fname)) + errx(EX_USAGE, "combined filenames would be too long"); + + fpnt = fname + strlen(fname) - sufflen; + fpnt[sufflen + 2] = '\0'; + fpnt[0] = end; + fpnt[1] = beg; + + /* Basename | Suffix + * before: + * x | yz + * after: + * xz | a.. 
*/ + fpnt++; + sufflen++; + + /* Reset so we start back at all 'a's in our extended suffix. */ + fnum = 0; + } + + /* maxfiles = pattlen^sufflen, but don't use libm. */ + for (maxfiles = 1, i = 0; i < sufflen; i++) + if (LONG_MAX / pattlen < maxfiles) + errx(EX_USAGE, "suffix is too long (max %ld)", i); + else + maxfiles *= pattlen; + + if (fnum == maxfiles) + errx(EX_DATAERR, "too many files"); + + /* Generate suffix of sufflen letters */ + tfnum = fnum; + i = sufflen - 1; + do { + fpnt[i] = tfnum % pattlen + beg; + tfnum /= pattlen; + } while (i-- > 0); + fpnt[sufflen] = '\0'; + + ++fnum; + if ((ofd = open(fname, flags, DEFFILEMODE)) < 0) { + if (!clobber && errno == EEXIST) + goto again; + err(EX_IOERR, "%s", fname); + } + file_open = 1; +} + +static void +usage(void) +{ + (void)fprintf(stderr, +"usage: split [-cd] [-l line_count] [-a suffix_length] [file [prefix]]\n" +" split [-cd] -b byte_count[K|k|M|m|G|g] [-a suffix_length] [file [prefix]]\n" +" split [-cd] -n chunk_count [-a suffix_length] [file [prefix]]\n" +" split [-cd] -p pattern [-a suffix_length] [file [prefix]]\n"); + exit(EX_USAGE); +} diff --git a/usr.bin/split/tests/Makefile b/usr.bin/split/tests/Makefile new file mode 100644 index 000000000000..65f62cff26f1 --- /dev/null +++ b/usr.bin/split/tests/Makefile @@ -0,0 +1,5 @@ +PACKAGE= tests + +ATF_TESTS_SH+= split_test + +.include <bsd.test.mk> diff --git a/usr.bin/split/tests/Makefile.depend b/usr.bin/split/tests/Makefile.depend new file mode 100644 index 000000000000..11aba52f82cf --- /dev/null +++ b/usr.bin/split/tests/Makefile.depend @@ -0,0 +1,10 @@ +# Autogenerated - do NOT edit! 
+ +DIRDEPS = \ + + +.include <dirdeps.mk> + +.if ${DEP_RELDIR} == ${_DEP_RELDIR} +# local dependencies - needed for -jN in clean tree +.endif diff --git a/usr.bin/split/tests/split_test.sh b/usr.bin/split/tests/split_test.sh new file mode 100755 index 000000000000..48065719055a --- /dev/null +++ b/usr.bin/split/tests/split_test.sh @@ -0,0 +1,274 @@ +# +# SPDX-License-Identifier: BSD-2-Clause +# +# Copyright (c) 2022-2023 Klara Systems +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+# + +# sys/param.h +: ${MAXBSIZE:=65536} + +atf_test_case bytes +bytes_body() +{ + printf "aaaa" > foo-aa + printf "bb\nc" > foo-ab + printf "ccc\n" > foo-ac + + cat foo-* > foo + atf_check split -b 4 foo split- + atf_check -o file:foo-aa cat split-aa + atf_check -o file:foo-ab cat split-ab + atf_check -o file:foo-ac cat split-ac + + # MAXBSIZE is the default buffer size, so we'll split at just a little + # bit past the buffer size to make sure that it still properly splits + # even when it needs to read again to hit the limit. + bsize=$((MAXBSIZE + 12)) + rm foo-* foo + jot -ns "" -b "a" ${bsize} > foo-aa + jot -ns "" -b "b" ${bsize} > foo-ab + jot -ns "" -b "c" 12 > foo-ac + + cat foo-* > foo + atf_check split -b ${bsize} foo split- + atf_check -o file:foo-aa cat split-aa + atf_check -o file:foo-ab cat split-ab + atf_check -o file:foo-ac cat split-ac +} + +atf_test_case chunks +chunks_body() +{ + jot -ns "" -b "a" 4096 > foo + jot -ns "" -b "b" 4096 >> foo + jot -ns "" -b "c" 4104 >> foo + + chunks=3 + jot -ns "" -b "a" 4096 > foo-aa + jot -ns "" -b "b" 2 >> foo-aa + jot -ns "" -b "b" 4094 > foo-ab + jot -ns "" -b "c" 4 >> foo-ab + jot -ns "" -b "c" 4100 > foo-ac + + atf_check split -n ${chunks} foo split- + atf_check -o file:foo-aa cat split-aa + atf_check -o file:foo-ab cat split-ab + atf_check -o file:foo-ac cat split-ac +} + +atf_test_case sensible_lines +sensible_lines_body() +{ + echo "The quick brown fox" > foo-aa + echo "jumps over" > foo-ab + echo "the lazy dog" > foo-ac + + cat foo-* > foo + atf_check split -l 1 foo split- + atf_check -o file:foo-aa cat split-aa + atf_check -o file:foo-ab cat split-ab + atf_check -o file:foo-ac cat split-ac + + # Try again, make sure that `-` uses stdin as documented. 
+ atf_check rm split-* + atf_check -x 'split -l 1 - split- < foo' + atf_check -o file:foo-aa cat split-aa + atf_check -o file:foo-ab cat split-ab + atf_check -o file:foo-ac cat split-ac + + # Finally, try with -l == 2; we should see a 2/1 split instead of the + # previous 1/1/1. + cat foo-aa foo-ab > foo-aa-ng + cat foo-ac > foo-ab-ng + + atf_check rm split-* + atf_check split -l 2 foo split- + + atf_check -o file:foo-aa-ng cat split-aa + atf_check -o file:foo-ab-ng cat split-ab +} + +atf_test_case long_lines +long_lines_body() +{ + + # Test file lines will be: + # a x MAXBSIZE + # b x MAXBSIZE + c x MAXBSIZE + # d x 1024 + # + # The historical split(1) implementation wouldn't grow its internal + # buffer, so we'd end up with 2/3 split- files being wrong with -l 1. + # Notably, split-aa would include most of the first two lines, split-ab + # a tiny fraction of the second line, and split-ac the third line. + # + # Recent split(1) instead grows the buffer until we can either fit the + # line or we run out of memory. 
+ jot -s "" -b "a" ${MAXBSIZE} > foo-aa + jot -ns "" -b "b" ${MAXBSIZE} > foo-ab + jot -s "" -b "c" ${MAXBSIZE} >> foo-ab + jot -s "" -b "d" 1024 > foo-ac + + cat foo-* > foo + atf_check split -l 1 foo split- + + atf_check -o file:foo-aa cat split-aa + atf_check -o file:foo-ab cat split-ab + atf_check -o file:foo-ac cat split-ac +} + +atf_test_case numeric_suffix +numeric_suffix_body() +{ + echo "The quick brown fox" > foo-00 + echo "jumps over" > foo-01 + echo "the lazy dog" > foo-02 + + cat foo-* > foo + atf_check split -d -l 1 foo split- + + atf_check -o file:foo-00 cat split-00 + atf_check -o file:foo-01 cat split-01 + atf_check -o file:foo-02 cat split-02 +} + +atf_test_case larger_suffix_length +larger_suffix_length_body() +{ + :> foo + + # Generate foo-000 through foo-009, then foo-010 and foo-011 + for i in $(seq -w 0 11); do + len=$((${i##0} + 1)) + file="foo-0${i}" + jot -s "" -b "a" ${len} > ${file} + cat ${file} >> foo + done + + atf_check split -a 3 -d -l 1 foo split- + for i in $(seq -w 0 11); do + srcfile="foo-0${i}" + splitfile="split-0${i}" + atf_check -o file:"${srcfile}" cat "${splitfile}" + done +} + +atf_test_case pattern +pattern_body() +{ + + # Some fake yaml gives us a good realistic use-case for -p, as we can + # split on top-level stanzas. 
+ cat <<EOF > foo-aa +cat: + aa: true + ab: true + ac: true +EOF + cat <<EOF > foo-ab +dog: + ba: true + bb: true + bc: true +EOF + + cat foo-* > foo + + atf_check split -p "^[^[:space:]]+:" foo split- + atf_check -o file:foo-aa cat split-aa + atf_check -o file:foo-ab cat split-ab +} + +atf_test_case autoextend +autoextend_body() +{ + seq $((26*25+1)) >input + atf_check split -l1 input + atf_check -o inline:"$((26*25))\n" cat xyz + atf_check -o inline:"$((26*25+1))\n" cat xzaaa +} + +atf_test_case noautoextend +noautoextend_body() +{ + seq $((26*26)) >input + atf_check split -a2 -l1 input + atf_check -o inline:"$((26*26))\n" cat xzz +} + +atf_test_case reautoextend +reautoextend_body() +{ + seq $((26*25+1)) >input + atf_check split -a2 -a0 -l1 input + atf_check -o inline:"$((26*25))\n" cat xyz + atf_check -o inline:"$((26*25+1))\n" cat xzaaa +} + +atf_test_case continue +continue_body() +{ + echo hello >input + atf_check split input + atf_check -o file:input cat xaa + atf_check -s exit:1 -e ignore cat xab + atf_check split -c input + atf_check -o file:input cat xab +} + +atf_test_case undocumented_kludge +undocumented_kludge_body() +{ + seq 5000 >input + atf_check split -1000 input + atf_check -o file:xae seq 4001 5000 + atf_check split -d1000 input + atf_check -o file:x04 seq 4001 5000 +} + +atf_test_case duplicate_linecount +duplicate_linecount_body() +{ + atf_check -s exit:64 -e ignore split -5 -5 /dev/null + atf_check -s exit:64 -e ignore split -l5 -5 /dev/null + atf_check -s exit:64 -e ignore split -5 -l5 /dev/null + atf_check -s exit:64 -e ignore split -l5 -l5 /dev/null +} + +atf_init_test_cases() +{ + atf_add_test_case bytes + atf_add_test_case chunks + atf_add_test_case sensible_lines + atf_add_test_case long_lines + atf_add_test_case numeric_suffix + atf_add_test_case larger_suffix_length + atf_add_test_case pattern + atf_add_test_case autoextend + atf_add_test_case noautoextend + atf_add_test_case reautoextend + atf_add_test_case continue + 
atf_add_test_case undocumented_kludge + atf_add_test_case duplicate_linecount +} |