diff options
Diffstat (limited to 'contrib/libdiff/lib/diff_main.c')
-rw-r--r-- | contrib/libdiff/lib/diff_main.c | 664 |
1 files changed, 664 insertions, 0 deletions
diff --git a/contrib/libdiff/lib/diff_main.c b/contrib/libdiff/lib/diff_main.c new file mode 100644 index 000000000000..1cf91c0a1bdb --- /dev/null +++ b/contrib/libdiff/lib/diff_main.c @@ -0,0 +1,664 @@ +/* Generic infrastructure to implement various diff algorithms (implementation). */ +/* + * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + + +#include <sys/queue.h> +#include <ctype.h> +#include <errno.h> +#include <stdint.h> +#include <stdlib.h> +#include <stdbool.h> +#include <stdio.h> +#include <string.h> +#include <limits.h> +#include <unistd.h> + +#include <assert.h> + +#include <arraylist.h> +#include <diff_main.h> + +#include "diff_internal.h" +#include "diff_debug.h" + +inline enum diff_chunk_type +diff_chunk_type(const struct diff_chunk *chunk) +{ + if (!chunk->left_count && !chunk->right_count) + return CHUNK_EMPTY; + if (!chunk->solved) + return CHUNK_ERROR; + if (!chunk->right_count) + return CHUNK_MINUS; + if (!chunk->left_count) + return CHUNK_PLUS; + if (chunk->left_count != chunk->right_count) + return CHUNK_ERROR; + return CHUNK_SAME; +} + +static int +read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len) +{ + ssize_t r; + + if (fseeko(f, at_pos, SEEK_SET) == -1) + return errno; + r = fread(buf, sizeof(char), len, f); + if ((r == 0 || r < len) && ferror(f)) + return EIO; + if (r != len) + return EIO; + return 0; +} + +static int +buf_cmp(const unsigned char *left, size_t left_len, + const unsigned char *right, size_t right_len, + bool ignore_whitespace) +{ + int cmp; + + if (ignore_whitespace) { + int il = 0, ir = 0; + while (il < left_len && ir < right_len) { + unsigned char cl = left[il]; + unsigned char cr = right[ir]; + + if (isspace((unsigned char)cl) && il < left_len) { + il++; + continue; + } + if (isspace((unsigned char)cr) && ir < right_len) { + ir++; + continue; + } + + if (cl > cr) + return 1; + if (cr > cl) + return -1; + il++; + ir++; + } + while (il < left_len) { + unsigned char cl = left[il++]; + if (!isspace((unsigned char)cl)) + return 1; + } + while (ir < right_len) { + unsigned char cr = right[ir++]; + if (!isspace((unsigned char)cr)) + return -1; + } + + return 0; + } + + cmp = memcmp(left, right, MIN(left_len, right_len)); + if (cmp) + return cmp; + if (left_len == right_len) + return 0; + return (left_len > right_len) ? 1 : -1; +} + +int +diff_atom_cmp(int *cmp, + const struct diff_atom *left, + const struct diff_atom *right) +{ + off_t remain_left, remain_right; + int flags = (left->root->diff_flags | right->root->diff_flags); + bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE); + + if (!left->len && !right->len) { + *cmp = 0; + return 0; + } + if (!ignore_whitespace) { + if (!right->len) { + *cmp = 1; + return 0; + } + if (!left->len) { + *cmp = -1; + return 0; + } + } + + if (left->at != NULL && right->at != NULL) { + *cmp = buf_cmp(left->at, left->len, right->at, right->len, + ignore_whitespace); + return 0; + } + + remain_left = left->len; + remain_right = right->len; + while (remain_left > 0 || remain_right > 0) { + const size_t chunksz = 8192; + unsigned char buf_left[chunksz], buf_right[chunksz]; + const uint8_t *p_left, *p_right; + off_t n_left, n_right; + int r; + + if (!remain_right) { + *cmp = 1; + return 0; + } + if (!remain_left) { + *cmp = -1; + return 0; + } + + n_left = MIN(chunksz, remain_left); + n_right = MIN(chunksz, remain_right); + + if (left->at == NULL) { + r = read_at(left->root->f, + left->pos + (left->len - remain_left), + buf_left, n_left); + if (r) { + *cmp = 0; + return r; + } + p_left = buf_left; + } else { + p_left = left->at + (left->len - remain_left); + } + + if (right->at == NULL) { + r = read_at(right->root->f, + right->pos + (right->len - remain_right), + buf_right, n_right); + if (r) { + *cmp = 0; + return r; + } + p_right = buf_right; + } else { + p_right = right->at + (right->len - remain_right); + } + + r = buf_cmp(p_left, n_left, p_right, n_right, + ignore_whitespace); + if (r) { + *cmp = r; + return 0; + } + + remain_left -= n_left; + remain_right -= n_right; + } + + *cmp = 0; + return 0; +} + +int +diff_atom_same(bool *same, + const struct diff_atom *left, + const struct diff_atom *right) +{ + int cmp; + int r; + if (left->hash != right->hash) { + *same = false; + return 0; + } + r = diff_atom_cmp(&cmp, left, right); + if (r) { + *same = true; + return r; + } + *same = (cmp == 0); + return 0; +} + +static struct diff_chunk * +diff_state_add_solved_chunk(struct diff_state *state, + const struct diff_chunk *chunk) +{ + diff_chunk_arraylist_t *result; + struct diff_chunk *new_chunk; + enum diff_chunk_type last_t; + enum diff_chunk_type new_t; + struct diff_chunk *last; + + /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk + * never directly follows a plus chunk. */ + result = &state->result->chunks; + + last_t = result->len ? diff_chunk_type(&result->head[result->len - 1]) + : CHUNK_EMPTY; + new_t = diff_chunk_type(chunk); + + debug("ADD %s chunk #%u:\n", chunk->solved ? "solved" : "UNSOLVED", + result->len); + debug("L\n"); + debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count); + debug("R\n"); + debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count); + + if (result->len) { + last = &result->head[result->len - 1]; + assert(chunk->left_start + == last->left_start + last->left_count); + assert(chunk->right_start + == last->right_start + last->right_count); + } + + if (new_t == last_t) { + new_chunk = &result->head[result->len - 1]; + new_chunk->left_count += chunk->left_count; + new_chunk->right_count += chunk->right_count; + debug(" - added chunk touches previous one of same type, joined:\n"); + debug("L\n"); + debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count); + debug("R\n"); + debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count); + } else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) { + enum diff_chunk_type prev_last_t = + result->len > 1 ? + diff_chunk_type(&result->head[result->len - 2]) + : CHUNK_EMPTY; + /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk-> + * Is the one before that also a minus? combine. */ + if (prev_last_t == CHUNK_MINUS) { + new_chunk = &result->head[result->len - 2]; + new_chunk->left_count += chunk->left_count; + new_chunk->right_count += chunk->right_count; + + debug(" - added minus-chunk follows plus-chunk," + " put before that plus-chunk and joined" + " with preceding minus-chunk:\n"); + debug("L\n"); + debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count); + debug("R\n"); + debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count); + } else { + ARRAYLIST_INSERT(new_chunk, *result, result->len - 1); + if (!new_chunk) + return NULL; + *new_chunk = *chunk; + + /* The new minus chunk indicates to which position on + * the right it corresponds, even though it doesn't add + * any lines on the right. By moving above a plus chunk, + * that position on the right has shifted. */ + last = &result->head[result->len - 1]; + new_chunk->right_start = last->right_start; + + debug(" - added minus-chunk follows plus-chunk," + " put before that plus-chunk\n"); + } + + /* That last_t == CHUNK_PLUS indicates to which position on the + * left it corresponds, even though it doesn't add any lines on + * the left. By inserting/extending the prev_last_t == + * CHUNK_MINUS, that position on the left has shifted. */ + last = &result->head[result->len - 1]; + last->left_start = new_chunk->left_start + + new_chunk->left_count; + + } else { + ARRAYLIST_ADD(new_chunk, *result); + if (!new_chunk) + return NULL; + *new_chunk = *chunk; + } + return new_chunk; +} + +/* Even if a left or right side is empty, diff output may need to know the + * position in that file. + * So left_start or right_start must never be NULL -- pass left_count or + * right_count as zero to indicate staying at that position without consuming + * any lines. */ +struct diff_chunk * +diff_state_add_chunk(struct diff_state *state, bool solved, + struct diff_atom *left_start, unsigned int left_count, + struct diff_atom *right_start, unsigned int right_count) +{ + struct diff_chunk *new_chunk; + struct diff_chunk chunk = { + .solved = solved, + .left_start = left_start, + .left_count = left_count, + .right_start = right_start, + .right_count = right_count, + }; + + /* An unsolved chunk means store as intermediate result for later + * re-iteration. + * If there already are intermediate results, that means even a + * following solved chunk needs to go to intermediate results, so that + * it is later put in the final correct position in solved chunks. + */ + if (!solved || state->temp_result.len) { + /* Append to temp_result */ + debug("ADD %s chunk to temp result:\n", + chunk.solved ? "solved" : "UNSOLVED"); + debug("L\n"); + debug_dump_atoms(&state->left, left_start, left_count); + debug("R\n"); + debug_dump_atoms(&state->right, right_start, right_count); + ARRAYLIST_ADD(new_chunk, state->temp_result); + if (!new_chunk) + return NULL; + *new_chunk = chunk; + return new_chunk; + } + + return diff_state_add_solved_chunk(state, &chunk); +} + +static void +diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data, + unsigned long long len, int diff_flags) +{ + *d = (struct diff_data){ + .f = f, + .pos = 0, + .data = data, + .len = len, + .root = d, + .diff_flags = diff_flags, + }; +} + +void +diff_data_init_subsection(struct diff_data *d, struct diff_data *parent, + struct diff_atom *from_atom, unsigned int atoms_count) +{ + struct diff_atom *last_atom; + + debug("diff_data %p parent %p from_atom %p atoms_count %u\n", + d, parent, from_atom, atoms_count); + debug(" from_atom "); + debug_dump_atom(parent, NULL, from_atom); + + if (atoms_count == 0) { + *d = (struct diff_data){ + .f = NULL, + .pos = 0, + .data = NULL, + .len = 0, + .root = parent->root, + .atoms.head = NULL, + .atoms.len = atoms_count, + }; + + return; + } + + last_atom = from_atom + atoms_count - 1; + *d = (struct diff_data){ + .f = NULL, + .pos = from_atom->pos, + .data = from_atom->at, + .len = (last_atom->pos + last_atom->len) - from_atom->pos, + .root = parent->root, + .atoms.head = from_atom, + .atoms.len = atoms_count, + }; + + debug("subsection:\n"); + debug_dump(d); +} + +void +diff_data_free(struct diff_data *diff_data) +{ + if (!diff_data) + return; + if (diff_data->atoms.allocated) + ARRAYLIST_FREE(diff_data->atoms); +} + +int +diff_algo_none(const struct diff_algo_config *algo_config, + struct diff_state *state) +{ + debug("\n** %s\n", __func__); + debug("left:\n"); + debug_dump(&state->left); + debug("right:\n"); + debug_dump(&state->right); + debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL, + 0); + + /* Add a chunk of equal lines, if any */ + struct diff_atom *l = state->left.atoms.head; + unsigned int l_len = state->left.atoms.len; + struct diff_atom *r = state->right.atoms.head; + unsigned int r_len = state->right.atoms.len; + unsigned int equal_atoms_start = 0; + unsigned int equal_atoms_end = 0; + unsigned int l_idx = 0; + unsigned int r_idx = 0; + + while (equal_atoms_start < l_len + && equal_atoms_start < r_len) { + int err; + bool same; + err = diff_atom_same(&same, &l[equal_atoms_start], + &r[equal_atoms_start]); + if (err) + return err; + if (!same) + break; + equal_atoms_start++; + } + while (equal_atoms_end < (l_len - equal_atoms_start) + && equal_atoms_end < (r_len - equal_atoms_start)) { + int err; + bool same; + err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end], + &r[r_len - 1 - equal_atoms_end]); + if (err) + return err; + if (!same) + break; + equal_atoms_end++; + } + + /* Add a chunk of equal lines at the start */ + if (equal_atoms_start) { + if (!diff_state_add_chunk(state, true, + l, equal_atoms_start, + r, equal_atoms_start)) + return ENOMEM; + l_idx += equal_atoms_start; + r_idx += equal_atoms_start; + } + + /* Add a "minus" chunk with all lines from the left. */ + if (equal_atoms_start + equal_atoms_end < l_len) { + unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end; + if (!diff_state_add_chunk(state, true, + &l[l_idx], add_len, + &r[r_idx], 0)) + return ENOMEM; + l_idx += add_len; + } + + /* Add a "plus" chunk with all lines from the right. */ + if (equal_atoms_start + equal_atoms_end < r_len) { + unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end; + if (!diff_state_add_chunk(state, true, + &l[l_idx], 0, + &r[r_idx], add_len)) + return ENOMEM; + r_idx += add_len; + } + + /* Add a chunk of equal lines at the end */ + if (equal_atoms_end) { + if (!diff_state_add_chunk(state, true, + &l[l_idx], equal_atoms_end, + &r[r_idx], equal_atoms_end)) + return ENOMEM; + } + + return DIFF_RC_OK; +} + +static int +diff_run_algo(const struct diff_algo_config *algo_config, + struct diff_state *state) +{ + int rc; + + if (!algo_config || !algo_config->impl + || !state->recursion_depth_left + || !state->left.atoms.len || !state->right.atoms.len) { + debug("Fall back to diff_algo_none():%s%s%s\n", + (!algo_config || !algo_config->impl) ? " no-cfg" : "", + (!state->recursion_depth_left) ? " max-depth" : "", + (!state->left.atoms.len || !state->right.atoms.len)? + " trivial" : ""); + return diff_algo_none(algo_config, state); + } + + ARRAYLIST_FREE(state->temp_result); + ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE); + rc = algo_config->impl(algo_config, state); + switch (rc) { + case DIFF_RC_USE_DIFF_ALGO_FALLBACK: + debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n", + algo_config->fallback_algo); + rc = diff_run_algo(algo_config->fallback_algo, state); + goto return_rc; + + case DIFF_RC_OK: + /* continue below */ + break; + + default: + /* some error happened */ + goto return_rc; + } + + /* Pick up any diff chunks that are still unsolved and feed to + * inner_algo. inner_algo will solve unsolved chunks and append to + * result, and subsequent solved chunks on this level are then appended + * to result afterwards. */ + int i; + for (i = 0; i < state->temp_result.len; i++) { + struct diff_chunk *c = &state->temp_result.head[i]; + if (c->solved) { + diff_state_add_solved_chunk(state, c); + continue; + } + + /* c is an unsolved chunk, feed to inner_algo */ + struct diff_state inner_state = { + .result = state->result, + .recursion_depth_left = state->recursion_depth_left - 1, + .kd_buf = state->kd_buf, + .kd_buf_size = state->kd_buf_size, + }; + diff_data_init_subsection(&inner_state.left, &state->left, + c->left_start, c->left_count); + diff_data_init_subsection(&inner_state.right, &state->right, + c->right_start, c->right_count); + + rc = diff_run_algo(algo_config->inner_algo, &inner_state); + state->kd_buf = inner_state.kd_buf; + state->kd_buf_size = inner_state.kd_buf_size; + if (rc != DIFF_RC_OK) + goto return_rc; + } + + rc = DIFF_RC_OK; +return_rc: + ARRAYLIST_FREE(state->temp_result); + return rc; +} + +int +diff_atomize_file(struct diff_data *d, + const struct diff_config *config, + FILE *f, const uint8_t *data, off_t len, int diff_flags) +{ + if (!config->atomize_func) + return EINVAL; + + diff_data_init_root(d, f, data, len, diff_flags); + + return config->atomize_func(config->atomize_func_data, d); + +} + +struct diff_result * +diff_main(const struct diff_config *config, struct diff_data *left, + struct diff_data *right) +{ + struct diff_result *result = malloc(sizeof(struct diff_result)); + if (!result) + return NULL; + + *result = (struct diff_result){}; + result->left = left; + result->right = right; + + struct diff_state state = { + .result = result, + .recursion_depth_left = config->max_recursion_depth ? + config->max_recursion_depth : UINT_MAX, + .kd_buf = NULL, + .kd_buf_size = 0, + }; + diff_data_init_subsection(&state.left, left, + left->atoms.head, + left->atoms.len); + diff_data_init_subsection(&state.right, right, + right->atoms.head, + right->atoms.len); + + result->rc = diff_run_algo(config->algo, &state); + free(state.kd_buf); + + return result; +} + +void +diff_result_free(struct diff_result *result) +{ + if (!result) + return; + ARRAYLIST_FREE(result->chunks); + free(result); +} + +int +diff_result_contains_printable_chunks(struct diff_result *result) +{ + struct diff_chunk *c; + enum diff_chunk_type t; + int i; + + for (i = 0; i < result->chunks.len; i++) { + c = &result->chunks.head[i]; + t = diff_chunk_type(c); + if (t == CHUNK_MINUS || t == CHUNK_PLUS) + return 1; + } + + return 0; +} |