diff options
Diffstat (limited to 'lib/diff_atomize_text.c')
-rw-r--r-- | lib/diff_atomize_text.c | 197 |
1 files changed, 197 insertions, 0 deletions
diff --git a/lib/diff_atomize_text.c b/lib/diff_atomize_text.c new file mode 100644 index 000000000000..32023105af94 --- /dev/null +++ b/lib/diff_atomize_text.c @@ -0,0 +1,197 @@ +/* Split source by line breaks, and calculate a simplistic checksum. */ +/* + * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <errno.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <ctype.h> + +#include <arraylist.h> +#include <diff_main.h> + +#include "diff_internal.h" +#include "diff_debug.h" + +unsigned int +diff_atom_hash_update(unsigned int hash, unsigned char atom_byte) +{ + return hash * 23 + atom_byte; +} + +static int +diff_data_atomize_text_lines_fd(struct diff_data *d) +{ + off_t pos = 0; + const off_t end = pos + d->len; + unsigned int array_size_estimate = d->len / 50; + unsigned int pow2 = 1; + bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); + bool embedded_nul = false; + + while (array_size_estimate >>= 1) + pow2++; + + ARRAYLIST_INIT(d->atoms, 1 << pow2); + + if (fseek(d->root->f, 0L, SEEK_SET) == -1) + return errno; + + while (pos < end) { + off_t line_end = pos; + unsigned int hash = 0; + unsigned char buf[512]; + size_t r, i; + struct diff_atom *atom; + int eol = 0; + + while (eol == 0 && line_end < end) { + r = fread(buf, sizeof(char), sizeof(buf), d->root->f); + if (r == 0 && ferror(d->root->f)) + return EIO; + i = 0; + while (eol == 0 && i < r) { + if (buf[i] != '\r' && buf[i] != '\n') { + if (!ignore_whitespace + || !isspace((unsigned char)buf[i])) + hash = diff_atom_hash_update( + hash, buf[i]); + if (buf[i] == '\0') + embedded_nul = true; + line_end++; + } else + eol = buf[i]; + i++; + } + } + + /* When not at the end of data, the line ending char ('\r' or + * '\n') must follow */ + if (line_end < end) + line_end++; + /* If that was an '\r', also pull in any following '\n' */ + if (line_end < end && eol == '\r') { + if (fseeko(d->root->f, line_end, SEEK_SET) == -1) + return errno; + r = fread(buf, sizeof(char), sizeof(buf), d->root->f); + if (r == 0 && ferror(d->root->f)) + return EIO; + if (r > 0 && buf[0] == '\n') + line_end++; + } + + /* Record the found line as diff atom */ + ARRAYLIST_ADD(atom, d->atoms); + if (!atom) + return ENOMEM; + + *atom = (struct diff_atom){ + .root = d, + .pos = pos, + .at = NULL, /* atom data is not memory-mapped */ + .len = line_end - pos, + .hash = hash, + }; + + /* Starting point for next line: */ + pos = line_end; + if (fseeko(d->root->f, pos, SEEK_SET) == -1) + return errno; + } + + /* File are considered binary if they contain embedded '\0' bytes. */ + if (embedded_nul) + d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; + + return DIFF_RC_OK; +} + +static int +diff_data_atomize_text_lines_mmap(struct diff_data *d) +{ + const uint8_t *pos = d->data; + const uint8_t *end = pos + d->len; + bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); + bool embedded_nul = false; + unsigned int array_size_estimate = d->len / 50; + unsigned int pow2 = 1; + while (array_size_estimate >>= 1) + pow2++; + + ARRAYLIST_INIT(d->atoms, 1 << pow2); + + while (pos < end) { + const uint8_t *line_end = pos; + unsigned int hash = 0; + + while (line_end < end && *line_end != '\r' && *line_end != '\n') { + if (!ignore_whitespace + || !isspace((unsigned char)*line_end)) + hash = diff_atom_hash_update(hash, *line_end); + if (*line_end == '\0') + embedded_nul = true; + line_end++; + } + + /* When not at the end of data, the line ending char ('\r' or + * '\n') must follow */ + if (line_end < end && *line_end == '\r') + line_end++; + if (line_end < end && *line_end == '\n') + line_end++; + + /* Record the found line as diff atom */ + struct diff_atom *atom; + ARRAYLIST_ADD(atom, d->atoms); + if (!atom) + return ENOMEM; + + *atom = (struct diff_atom){ + .root = d, + .pos = (off_t)(pos - d->data), + .at = pos, + .len = line_end - pos, + .hash = hash, + }; + + /* Starting point for next line: */ + pos = line_end; + } + + /* File are considered binary if they contain embedded '\0' bytes. */ + if (embedded_nul) + d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; + + return DIFF_RC_OK; +} + +static int +diff_data_atomize_text_lines(struct diff_data *d) +{ + if (d->data == NULL) + return diff_data_atomize_text_lines_fd(d); + else + return diff_data_atomize_text_lines_mmap(d); +} + +int +diff_atomize_text_by_line(void *func_data, struct diff_data *d) +{ + return diff_data_atomize_text_lines(d); +} |