diff options
Diffstat (limited to 'subversion/libsvn_diff/parse-diff.c')
-rw-r--r-- | subversion/libsvn_diff/parse-diff.c | 1373 |
1 files changed, 1373 insertions, 0 deletions
diff --git a/subversion/libsvn_diff/parse-diff.c b/subversion/libsvn_diff/parse-diff.c new file mode 100644 index 000000000000..a01b4d52743b --- /dev/null +++ b/subversion/libsvn_diff/parse-diff.c @@ -0,0 +1,1373 @@ +/* + * parse-diff.c: functions for parsing diff files + * + * ==================================================================== + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * ==================================================================== + */ + +#include <stdlib.h> +#include <stddef.h> +#include <string.h> + +#include "svn_hash.h" +#include "svn_types.h" +#include "svn_error.h" +#include "svn_io.h" +#include "svn_pools.h" +#include "svn_props.h" +#include "svn_string.h" +#include "svn_utf.h" +#include "svn_dirent_uri.h" +#include "svn_diff.h" + +#include "private/svn_eol_private.h" +#include "private/svn_dep_compat.h" + +/* Helper macro for readability */ +#define starts_with(str, start) \ + (strncmp((str), (start), strlen(start)) == 0) + +/* Like strlen() but for string literals. */ +#define STRLEN_LITERAL(str) (sizeof(str) - 1) + +/* This struct describes a range within a file, as well as the + * current cursor position within the range. All numbers are in bytes. */ +struct svn_diff__hunk_range { + apr_off_t start; + apr_off_t end; + apr_off_t current; +}; + +struct svn_diff_hunk_t { + /* The patch this hunk belongs to. */ + svn_patch_t *patch; + + /* APR file handle to the patch file this hunk came from. */ + apr_file_t *apr_file; + + /* Ranges used to keep track of this hunk's texts positions within + * the patch file. */ + struct svn_diff__hunk_range diff_text_range; + struct svn_diff__hunk_range original_text_range; + struct svn_diff__hunk_range modified_text_range; + + /* Hunk ranges as they appeared in the patch file. + * All numbers are lines, not bytes. */ + svn_linenum_t original_start; + svn_linenum_t original_length; + svn_linenum_t modified_start; + svn_linenum_t modified_length; + + /* Number of lines of leading and trailing hunk context. */ + svn_linenum_t leading_context; + svn_linenum_t trailing_context; +}; + +void +svn_diff_hunk_reset_diff_text(svn_diff_hunk_t *hunk) +{ + hunk->diff_text_range.current = hunk->diff_text_range.start; +} + +void +svn_diff_hunk_reset_original_text(svn_diff_hunk_t *hunk) +{ + if (hunk->patch->reverse) + hunk->modified_text_range.current = hunk->modified_text_range.start; + else + hunk->original_text_range.current = hunk->original_text_range.start; +} + +void +svn_diff_hunk_reset_modified_text(svn_diff_hunk_t *hunk) +{ + if (hunk->patch->reverse) + hunk->original_text_range.current = hunk->original_text_range.start; + else + hunk->modified_text_range.current = hunk->modified_text_range.start; +} + +svn_linenum_t +svn_diff_hunk_get_original_start(const svn_diff_hunk_t *hunk) +{ + return hunk->patch->reverse ? hunk->modified_start : hunk->original_start; +} + +svn_linenum_t +svn_diff_hunk_get_original_length(const svn_diff_hunk_t *hunk) +{ + return hunk->patch->reverse ? hunk->modified_length : hunk->original_length; +} + +svn_linenum_t +svn_diff_hunk_get_modified_start(const svn_diff_hunk_t *hunk) +{ + return hunk->patch->reverse ? hunk->original_start : hunk->modified_start; +} + +svn_linenum_t +svn_diff_hunk_get_modified_length(const svn_diff_hunk_t *hunk) +{ + return hunk->patch->reverse ? hunk->original_length : hunk->modified_length; +} + +svn_linenum_t +svn_diff_hunk_get_leading_context(const svn_diff_hunk_t *hunk) +{ + return hunk->leading_context; +} + +svn_linenum_t +svn_diff_hunk_get_trailing_context(const svn_diff_hunk_t *hunk) +{ + return hunk->trailing_context; +} + +/* Try to parse a positive number from a decimal number encoded + * in the string NUMBER. Return parsed number in OFFSET, and return + * TRUE if parsing was successful. */ +static svn_boolean_t +parse_offset(svn_linenum_t *offset, const char *number) +{ + svn_error_t *err; + apr_uint64_t val; + + err = svn_cstring_strtoui64(&val, number, 0, SVN_LINENUM_MAX_VALUE, 10); + if (err) + { + svn_error_clear(err); + return FALSE; + } + + *offset = (svn_linenum_t)val; + + return TRUE; +} + +/* Try to parse a hunk range specification from the string RANGE. + * Return parsed information in *START and *LENGTH, and return TRUE + * if the range parsed correctly. Note: This function may modify the + * input value RANGE. */ +static svn_boolean_t +parse_range(svn_linenum_t *start, svn_linenum_t *length, char *range) +{ + char *comma; + + if (*range == 0) + return FALSE; + + comma = strstr(range, ","); + if (comma) + { + if (strlen(comma + 1) > 0) + { + /* Try to parse the length. */ + if (! parse_offset(length, comma + 1)) + return FALSE; + + /* Snip off the end of the string, + * so we can comfortably parse the line + * number the hunk starts at. */ + *comma = '\0'; + } + else + /* A comma but no length? */ + return FALSE; + } + else + { + *length = 1; + } + + /* Try to parse the line number the hunk starts at. */ + return parse_offset(start, range); +} + +/* Try to parse a hunk header in string HEADER, putting parsed information + * into HUNK. Return TRUE if the header parsed correctly. ATAT is the + * character string used to delimit the hunk header. + * Do all allocations in POOL. */ +static svn_boolean_t +parse_hunk_header(const char *header, svn_diff_hunk_t *hunk, + const char *atat, apr_pool_t *pool) +{ + const char *p; + const char *start; + svn_stringbuf_t *range; + + p = header + strlen(atat); + if (*p != ' ') + /* No. */ + return FALSE; + p++; + if (*p != '-') + /* Nah... */ + return FALSE; + /* OK, this may be worth allocating some memory for... */ + range = svn_stringbuf_create_ensure(31, pool); + start = ++p; + while (*p && *p != ' ') + { + p++; + } + + if (*p != ' ') + /* No no no... */ + return FALSE; + + svn_stringbuf_appendbytes(range, start, p - start); + + /* Try to parse the first range. */ + if (! parse_range(&hunk->original_start, &hunk->original_length, range->data)) + return FALSE; + + /* Clear the stringbuf so we can reuse it for the second range. */ + svn_stringbuf_setempty(range); + p++; + if (*p != '+') + /* Eeek! */ + return FALSE; + /* OK, this may be worth copying... */ + start = ++p; + while (*p && *p != ' ') + { + p++; + } + if (*p != ' ') + /* No no no... */ + return FALSE; + + svn_stringbuf_appendbytes(range, start, p - start); + + /* Check for trailing @@ */ + p++; + if (! starts_with(p, atat)) + return FALSE; + + /* There may be stuff like C-function names after the trailing @@, + * but we ignore that. */ + + /* Try to parse the second range. */ + if (! parse_range(&hunk->modified_start, &hunk->modified_length, range->data)) + return FALSE; + + /* Hunk header is good. */ + return TRUE; +} + +/* Read a line of original or modified hunk text from the specified + * RANGE within FILE. FILE is expected to contain unidiff text. + * Leading unidiff symbols ('+', '-', and ' ') are removed from the line, + * Any lines commencing with the VERBOTEN character are discarded. + * VERBOTEN should be '+' or '-', depending on which form of hunk text + * is being read. + * + * All other parameters are as in svn_diff_hunk_readline_original_text() + * and svn_diff_hunk_readline_modified_text(). + */ +static svn_error_t * +hunk_readline_original_or_modified(apr_file_t *file, + struct svn_diff__hunk_range *range, + svn_stringbuf_t **stringbuf, + const char **eol, + svn_boolean_t *eof, + char verboten, + apr_pool_t *result_pool, + apr_pool_t *scratch_pool) +{ + apr_size_t max_len; + svn_boolean_t filtered; + apr_off_t pos; + svn_stringbuf_t *str; + + if (range->current >= range->end) + { + /* We're past the range. Indicate that no bytes can be read. */ + *eof = TRUE; + if (eol) + *eol = NULL; + *stringbuf = svn_stringbuf_create_empty(result_pool); + return SVN_NO_ERROR; + } + + pos = 0; + SVN_ERR(svn_io_file_seek(file, APR_CUR, &pos, scratch_pool)); + SVN_ERR(svn_io_file_seek(file, APR_SET, &range->current, scratch_pool)); + do + { + max_len = range->end - range->current; + SVN_ERR(svn_io_file_readline(file, &str, eol, eof, max_len, + result_pool, scratch_pool)); + range->current = 0; + SVN_ERR(svn_io_file_seek(file, APR_CUR, &range->current, scratch_pool)); + filtered = (str->data[0] == verboten || str->data[0] == '\\'); + } + while (filtered && ! *eof); + + if (filtered) + { + /* EOF, return an empty string. */ + *stringbuf = svn_stringbuf_create_ensure(0, result_pool); + } + else if (str->data[0] == '+' || str->data[0] == '-' || str->data[0] == ' ') + { + /* Shave off leading unidiff symbols. */ + *stringbuf = svn_stringbuf_create(str->data + 1, result_pool); + } + else + { + /* Return the line as-is. */ + *stringbuf = svn_stringbuf_dup(str, result_pool); + } + + SVN_ERR(svn_io_file_seek(file, APR_SET, &pos, scratch_pool)); + + return SVN_NO_ERROR; +} + +svn_error_t * +svn_diff_hunk_readline_original_text(svn_diff_hunk_t *hunk, + svn_stringbuf_t **stringbuf, + const char **eol, + svn_boolean_t *eof, + apr_pool_t *result_pool, + apr_pool_t *scratch_pool) +{ + return svn_error_trace( + hunk_readline_original_or_modified(hunk->apr_file, + hunk->patch->reverse ? + &hunk->modified_text_range : + &hunk->original_text_range, + stringbuf, eol, eof, + hunk->patch->reverse ? '-' : '+', + result_pool, scratch_pool)); +} + +svn_error_t * +svn_diff_hunk_readline_modified_text(svn_diff_hunk_t *hunk, + svn_stringbuf_t **stringbuf, + const char **eol, + svn_boolean_t *eof, + apr_pool_t *result_pool, + apr_pool_t *scratch_pool) +{ + return svn_error_trace( + hunk_readline_original_or_modified(hunk->apr_file, + hunk->patch->reverse ? + &hunk->original_text_range : + &hunk->modified_text_range, + stringbuf, eol, eof, + hunk->patch->reverse ? '+' : '-', + result_pool, scratch_pool)); +} + +svn_error_t * +svn_diff_hunk_readline_diff_text(svn_diff_hunk_t *hunk, + svn_stringbuf_t **stringbuf, + const char **eol, + svn_boolean_t *eof, + apr_pool_t *result_pool, + apr_pool_t *scratch_pool) +{ + svn_diff_hunk_t dummy; + svn_stringbuf_t *line; + apr_size_t max_len; + apr_off_t pos; + + if (hunk->diff_text_range.current >= hunk->diff_text_range.end) + { + /* We're past the range. Indicate that no bytes can be read. */ + *eof = TRUE; + if (eol) + *eol = NULL; + *stringbuf = svn_stringbuf_create_empty(result_pool); + return SVN_NO_ERROR; + } + + pos = 0; + SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_CUR, &pos, scratch_pool)); + SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_SET, + &hunk->diff_text_range.current, scratch_pool)); + max_len = hunk->diff_text_range.end - hunk->diff_text_range.current; + SVN_ERR(svn_io_file_readline(hunk->apr_file, &line, eol, eof, max_len, + result_pool, + scratch_pool)); + hunk->diff_text_range.current = 0; + SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_CUR, + &hunk->diff_text_range.current, scratch_pool)); + SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_SET, &pos, scratch_pool)); + + if (hunk->patch->reverse) + { + if (parse_hunk_header(line->data, &dummy, "@@", scratch_pool)) + { + /* Line is a hunk header, reverse it. */ + line = svn_stringbuf_createf(result_pool, + "@@ -%lu,%lu +%lu,%lu @@", + hunk->modified_start, + hunk->modified_length, + hunk->original_start, + hunk->original_length); + } + else if (parse_hunk_header(line->data, &dummy, "##", scratch_pool)) + { + /* Line is a hunk header, reverse it. */ + line = svn_stringbuf_createf(result_pool, + "## -%lu,%lu +%lu,%lu ##", + hunk->modified_start, + hunk->modified_length, + hunk->original_start, + hunk->original_length); + } + else + { + if (line->data[0] == '+') + line->data[0] = '-'; + else if (line->data[0] == '-') + line->data[0] = '+'; + } + } + + *stringbuf = line; + + return SVN_NO_ERROR; +} + +/* Parse *PROP_NAME from HEADER as the part after the INDICATOR line. + * Allocate *PROP_NAME in RESULT_POOL. + * Set *PROP_NAME to NULL if no valid property name was found. */ +static svn_error_t * +parse_prop_name(const char **prop_name, const char *header, + const char *indicator, apr_pool_t *result_pool) +{ + SVN_ERR(svn_utf_cstring_to_utf8(prop_name, + header + strlen(indicator), + result_pool)); + if (**prop_name == '\0') + *prop_name = NULL; + else if (! svn_prop_name_is_valid(*prop_name)) + { + svn_stringbuf_t *buf = svn_stringbuf_create(*prop_name, result_pool); + svn_stringbuf_strip_whitespace(buf); + *prop_name = (svn_prop_name_is_valid(buf->data) ? buf->data : NULL); + } + + return SVN_NO_ERROR; +} + +/* Return the next *HUNK from a PATCH in APR_FILE. + * If no hunk can be found, set *HUNK to NULL. + * Set IS_PROPERTY to TRUE if we have a property hunk. If the returned HUNK + * is the first belonging to a certain property, then PROP_NAME and + * PROP_OPERATION will be set too. If we have a text hunk, PROP_NAME will be + * NULL. If IGNORE_WHITESPACE is TRUE, lines without leading spaces will be + * treated as context lines. Allocate results in RESULT_POOL. + * Use SCRATCH_POOL for all other allocations. */ +static svn_error_t * +parse_next_hunk(svn_diff_hunk_t **hunk, + svn_boolean_t *is_property, + const char **prop_name, + svn_diff_operation_kind_t *prop_operation, + svn_patch_t *patch, + apr_file_t *apr_file, + svn_boolean_t ignore_whitespace, + apr_pool_t *result_pool, + apr_pool_t *scratch_pool) +{ + static const char * const minus = "--- "; + static const char * const text_atat = "@@"; + static const char * const prop_atat = "##"; + svn_stringbuf_t *line; + svn_boolean_t eof, in_hunk, hunk_seen; + apr_off_t pos, last_line; + apr_off_t start, end; + apr_off_t original_end; + apr_off_t modified_end; + svn_linenum_t original_lines; + svn_linenum_t modified_lines; + svn_linenum_t leading_context; + svn_linenum_t trailing_context; + svn_boolean_t changed_line_seen; + enum { + noise_line, + original_line, + modified_line, + context_line + } last_line_type; + apr_pool_t *iterpool; + + *prop_operation = svn_diff_op_unchanged; + + /* We only set this if we have a property hunk header. */ + *prop_name = NULL; + *is_property = FALSE; + + if (apr_file_eof(apr_file) == APR_EOF) + { + /* No more hunks here. */ + *hunk = NULL; + return SVN_NO_ERROR; + } + + in_hunk = FALSE; + hunk_seen = FALSE; + leading_context = 0; + trailing_context = 0; + changed_line_seen = FALSE; + original_end = 0; + modified_end = 0; + *hunk = apr_pcalloc(result_pool, sizeof(**hunk)); + + /* Get current seek position -- APR has no ftell() :( */ + pos = 0; + SVN_ERR(svn_io_file_seek(apr_file, APR_CUR, &pos, scratch_pool)); + + /* Start out assuming noise. */ + last_line_type = noise_line; + + iterpool = svn_pool_create(scratch_pool); + do + { + + svn_pool_clear(iterpool); + + /* Remember the current line's offset, and read the line. */ + last_line = pos; + SVN_ERR(svn_io_file_readline(apr_file, &line, NULL, &eof, APR_SIZE_MAX, + iterpool, iterpool)); + + /* Update line offset for next iteration. */ + pos = 0; + SVN_ERR(svn_io_file_seek(apr_file, APR_CUR, &pos, iterpool)); + + /* Lines starting with a backslash indicate a missing EOL: + * "\ No newline at end of file" or "end of property". */ + if (line->data[0] == '\\') + { + if (in_hunk) + { + char eolbuf[2]; + apr_size_t len; + apr_off_t off; + apr_off_t hunk_text_end; + + /* Comment terminates the hunk text and says the hunk text + * has no trailing EOL. Snip off trailing EOL which is part + * of the patch file but not part of the hunk text. */ + off = last_line - 2; + SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &off, iterpool)); + len = sizeof(eolbuf); + SVN_ERR(svn_io_file_read_full2(apr_file, eolbuf, len, &len, + &eof, iterpool)); + if (eolbuf[0] == '\r' && eolbuf[1] == '\n') + hunk_text_end = last_line - 2; + else if (eolbuf[1] == '\n' || eolbuf[1] == '\r') + hunk_text_end = last_line - 1; + else + hunk_text_end = last_line; + + if (last_line_type == original_line && original_end == 0) + original_end = hunk_text_end; + else if (last_line_type == modified_line && modified_end == 0) + modified_end = hunk_text_end; + else if (last_line_type == context_line) + { + if (original_end == 0) + original_end = hunk_text_end; + if (modified_end == 0) + modified_end = hunk_text_end; + } + + SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &pos, iterpool)); + } + + continue; + } + + if (in_hunk) + { + char c; + static const char add = '+'; + static const char del = '-'; + + if (! hunk_seen) + { + /* We're reading the first line of the hunk, so the start + * of the line just read is the hunk text's byte offset. */ + start = last_line; + } + + c = line->data[0]; + if (original_lines > 0 && modified_lines > 0 && + ((c == ' ') + /* Tolerate chopped leading spaces on empty lines. */ + || (! eof && line->len == 0) + /* Maybe tolerate chopped leading spaces on non-empty lines. */ + || (ignore_whitespace && c != del && c != add))) + { + /* It's a "context" line in the hunk. */ + hunk_seen = TRUE; + original_lines--; + modified_lines--; + if (changed_line_seen) + trailing_context++; + else + leading_context++; + last_line_type = context_line; + } + else if (original_lines > 0 && c == del) + { + /* It's a "deleted" line in the hunk. */ + hunk_seen = TRUE; + changed_line_seen = TRUE; + + /* A hunk may have context in the middle. We only want + trailing lines of context. */ + if (trailing_context > 0) + trailing_context = 0; + + original_lines--; + last_line_type = original_line; + } + else if (modified_lines > 0 && c == add) + { + /* It's an "added" line in the hunk. */ + hunk_seen = TRUE; + changed_line_seen = TRUE; + + /* A hunk may have context in the middle. We only want + trailing lines of context. */ + if (trailing_context > 0) + trailing_context = 0; + + modified_lines--; + last_line_type = modified_line; + } + else + { + if (eof) + { + /* The hunk ends at EOF. */ + end = pos; + } + else + { + /* The start of the current line marks the first byte + * after the hunk text. */ + end = last_line; + } + + if (original_end == 0) + original_end = end; + if (modified_end == 0) + modified_end = end; + break; /* Hunk was empty or has been read. */ + } + } + else + { + if (starts_with(line->data, text_atat)) + { + /* Looks like we have a hunk header, try to rip it apart. */ + in_hunk = parse_hunk_header(line->data, *hunk, text_atat, + iterpool); + if (in_hunk) + { + original_lines = (*hunk)->original_length; + modified_lines = (*hunk)->modified_length; + *is_property = FALSE; + } + } + else if (starts_with(line->data, prop_atat)) + { + /* Looks like we have a property hunk header, try to rip it + * apart. */ + in_hunk = parse_hunk_header(line->data, *hunk, prop_atat, + iterpool); + if (in_hunk) + { + original_lines = (*hunk)->original_length; + modified_lines = (*hunk)->modified_length; + *is_property = TRUE; + } + } + else if (starts_with(line->data, "Added: ")) + { + SVN_ERR(parse_prop_name(prop_name, line->data, "Added: ", + result_pool)); + if (*prop_name) + *prop_operation = svn_diff_op_added; + } + else if (starts_with(line->data, "Deleted: ")) + { + SVN_ERR(parse_prop_name(prop_name, line->data, "Deleted: ", + result_pool)); + if (*prop_name) + *prop_operation = svn_diff_op_deleted; + } + else if (starts_with(line->data, "Modified: ")) + { + SVN_ERR(parse_prop_name(prop_name, line->data, "Modified: ", + result_pool)); + if (*prop_name) + *prop_operation = svn_diff_op_modified; + } + else if (starts_with(line->data, minus) + || starts_with(line->data, "diff --git ")) + /* This could be a header of another patch. Bail out. */ + break; + } + } + /* Check for the line length since a file may not have a newline at the + * end and we depend upon the last line to be an empty one. */ + while (! eof || line->len > 0); + svn_pool_destroy(iterpool); + + if (! eof) + /* Rewind to the start of the line just read, so subsequent calls + * to this function or svn_diff_parse_next_patch() don't end + * up skipping the line -- it may contain a patch or hunk header. */ + SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &last_line, scratch_pool)); + + if (hunk_seen && start < end) + { + (*hunk)->patch = patch; + (*hunk)->apr_file = apr_file; + (*hunk)->leading_context = leading_context; + (*hunk)->trailing_context = trailing_context; + (*hunk)->diff_text_range.start = start; + (*hunk)->diff_text_range.current = start; + (*hunk)->diff_text_range.end = end; + (*hunk)->original_text_range.start = start; + (*hunk)->original_text_range.current = start; + (*hunk)->original_text_range.end = original_end; + (*hunk)->modified_text_range.start = start; + (*hunk)->modified_text_range.current = start; + (*hunk)->modified_text_range.end = modified_end; + } + else + /* Something went wrong, just discard the result. */ + *hunk = NULL; + + return SVN_NO_ERROR; +} + +/* Compare function for sorting hunks after parsing. + * We sort hunks by their original line offset. */ +static int +compare_hunks(const void *a, const void *b) +{ + const svn_diff_hunk_t *ha = *((const svn_diff_hunk_t *const *)a); + const svn_diff_hunk_t *hb = *((const svn_diff_hunk_t *const *)b); + + if (ha->original_start < hb->original_start) + return -1; + if (ha->original_start > hb->original_start) + return 1; + return 0; +} + +/* Possible states of the diff header parser. */ +enum parse_state +{ + state_start, /* initial */ + state_git_diff_seen, /* diff --git */ + state_git_tree_seen, /* a tree operation, rather then content change */ + state_git_minus_seen, /* --- /dev/null; or --- a/ */ + state_git_plus_seen, /* +++ /dev/null; or +++ a/ */ + state_move_from_seen, /* rename from foo.c */ + state_copy_from_seen, /* copy from foo.c */ + state_minus_seen, /* --- foo.c */ + state_unidiff_found, /* valid start of a regular unidiff header */ + state_git_header_found /* valid start of a --git diff header */ +}; + +/* Data type describing a valid state transition of the parser. */ +struct transition +{ + const char *expected_input; + enum parse_state required_state; + + /* A callback called upon each parser state transition. */ + svn_error_t *(*fn)(enum parse_state *new_state, char *input, + svn_patch_t *patch, apr_pool_t *result_pool, + apr_pool_t *scratch_pool); +}; + +/* UTF-8 encode and canonicalize the content of LINE as FILE_NAME. */ +static svn_error_t * +grab_filename(const char **file_name, const char *line, apr_pool_t *result_pool, + apr_pool_t *scratch_pool) +{ + const char *utf8_path; + const char *canon_path; + + /* Grab the filename and encode it in UTF-8. */ + /* TODO: Allow specifying the patch file's encoding. + * For now, we assume its encoding is native. */ + /* ### This can fail if the filename cannot be represented in the current + * ### locale's encoding. */ + SVN_ERR(svn_utf_cstring_to_utf8(&utf8_path, + line, + scratch_pool)); + + /* Canonicalize the path name. */ + canon_path = svn_dirent_canonicalize(utf8_path, scratch_pool); + + *file_name = apr_pstrdup(result_pool, canon_path); + + return SVN_NO_ERROR; +} + +/* Parse the '--- ' line of a regular unidiff. */ +static svn_error_t * +diff_minus(enum parse_state *new_state, char *line, svn_patch_t *patch, + apr_pool_t *result_pool, apr_pool_t *scratch_pool) +{ + /* If we can find a tab, it separates the filename from + * the rest of the line which we can discard. */ + char *tab = strchr(line, '\t'); + if (tab) + *tab = '\0'; + + SVN_ERR(grab_filename(&patch->old_filename, line + STRLEN_LITERAL("--- "), + result_pool, scratch_pool)); + + *new_state = state_minus_seen; + + return SVN_NO_ERROR; +} + +/* Parse the '+++ ' line of a regular unidiff. */ +static svn_error_t * +diff_plus(enum parse_state *new_state, char *line, svn_patch_t *patch, + apr_pool_t *result_pool, apr_pool_t *scratch_pool) +{ + /* If we can find a tab, it separates the filename from + * the rest of the line which we can discard. */ + char *tab = strchr(line, '\t'); + if (tab) + *tab = '\0'; + + SVN_ERR(grab_filename(&patch->new_filename, line + STRLEN_LITERAL("+++ "), + result_pool, scratch_pool)); + + *new_state = state_unidiff_found; + + return SVN_NO_ERROR; +} + +/* Parse the first line of a git extended unidiff. */ +static svn_error_t * +git_start(enum parse_state *new_state, char *line, svn_patch_t *patch, + apr_pool_t *result_pool, apr_pool_t *scratch_pool) +{ + const char *old_path_start; + char *old_path_end; + const char *new_path_start; + const char *new_path_end; + char *new_path_marker; + const char *old_path_marker; + + /* ### Add handling of escaped paths + * http://www.kernel.org/pub/software/scm/git/docs/git-diff.html: + * + * TAB, LF, double quote and backslash characters in pathnames are + * represented as \t, \n, \" and \\, respectively. If there is need for + * such substitution then the whole pathname is put in double quotes. + */ + + /* Our line should look like this: 'diff --git a/path b/path'. + * + * If we find any deviations from that format, we return with state reset + * to start. + */ + old_path_marker = strstr(line, " a/"); + + if (! old_path_marker) + { + *new_state = state_start; + return SVN_NO_ERROR; + } + + if (! *(old_path_marker + 3)) + { + *new_state = state_start; + return SVN_NO_ERROR; + } + + new_path_marker = strstr(old_path_marker, " b/"); + + if (! new_path_marker) + { + *new_state = state_start; + return SVN_NO_ERROR; + } + + if (! *(new_path_marker + 3)) + { + *new_state = state_start; + return SVN_NO_ERROR; + } + + /* By now, we know that we have a line on the form '--git diff a/.+ b/.+' + * We only need the filenames when we have deleted or added empty + * files. In those cases the old_path and new_path is identical on the + * 'diff --git' line. For all other cases we fetch the filenames from + * other header lines. */ + old_path_start = line + STRLEN_LITERAL("diff --git a/"); + new_path_end = line + strlen(line); + new_path_start = old_path_start; + + while (TRUE) + { + ptrdiff_t len_old; + ptrdiff_t len_new; + + new_path_marker = strstr(new_path_start, " b/"); + + /* No new path marker, bail out. */ + if (! new_path_marker) + break; + + old_path_end = new_path_marker; + new_path_start = new_path_marker + STRLEN_LITERAL(" b/"); + + /* No path after the marker. */ + if (! *new_path_start) + break; + + len_old = old_path_end - old_path_start; + len_new = new_path_end - new_path_start; + + /* Are the paths before and after the " b/" marker the same? */ + if (len_old == len_new + && ! strncmp(old_path_start, new_path_start, len_old)) + { + *old_path_end = '\0'; + SVN_ERR(grab_filename(&patch->old_filename, old_path_start, + result_pool, scratch_pool)); + + SVN_ERR(grab_filename(&patch->new_filename, new_path_start, + result_pool, scratch_pool)); + break; + } + } + + /* We assume that the path is only modified until we've found a 'tree' + * header */ + patch->operation = svn_diff_op_modified; + + *new_state = state_git_diff_seen; + return SVN_NO_ERROR; +} + +/* Parse the '--- ' line of a git extended unidiff. */ +static svn_error_t * +git_minus(enum parse_state *new_state, char *line, svn_patch_t *patch, + apr_pool_t *result_pool, apr_pool_t *scratch_pool) +{ + /* If we can find a tab, it separates the filename from + * the rest of the line which we can discard. */ + char *tab = strchr(line, '\t'); + if (tab) + *tab = '\0'; + + if (starts_with(line, "--- /dev/null")) + SVN_ERR(grab_filename(&patch->old_filename, "/dev/null", + result_pool, scratch_pool)); + else + SVN_ERR(grab_filename(&patch->old_filename, line + STRLEN_LITERAL("--- a/"), + result_pool, scratch_pool)); + + *new_state = state_git_minus_seen; + return SVN_NO_ERROR; +} + +/* Parse the '+++ ' line of a git extended unidiff. */ +static svn_error_t * +git_plus(enum parse_state *new_state, char *line, svn_patch_t *patch, + apr_pool_t *result_pool, apr_pool_t *scratch_pool) +{ + /* If we can find a tab, it separates the filename from + * the rest of the line which we can discard. */ + char *tab = strchr(line, '\t'); + if (tab) + *tab = '\0'; + + if (starts_with(line, "+++ /dev/null")) + SVN_ERR(grab_filename(&patch->new_filename, "/dev/null", + result_pool, scratch_pool)); + else + SVN_ERR(grab_filename(&patch->new_filename, line + STRLEN_LITERAL("+++ b/"), + result_pool, scratch_pool)); + + *new_state = state_git_header_found; + return SVN_NO_ERROR; +} + +/* Parse the 'rename from ' line of a git extended unidiff. */ +static svn_error_t * +git_move_from(enum parse_state *new_state, char *line, svn_patch_t *patch, + apr_pool_t *result_pool, apr_pool_t *scratch_pool) +{ + SVN_ERR(grab_filename(&patch->old_filename, + line + STRLEN_LITERAL("rename from "), + result_pool, scratch_pool)); + + *new_state = state_move_from_seen; + return SVN_NO_ERROR; +} + +/* Parse the 'rename to ' line of a git extended unidiff. */ +static svn_error_t * +git_move_to(enum parse_state *new_state, char *line, svn_patch_t *patch, + apr_pool_t *result_pool, apr_pool_t *scratch_pool) +{ + SVN_ERR(grab_filename(&patch->new_filename, + line + STRLEN_LITERAL("rename to "), + result_pool, scratch_pool)); + + patch->operation = svn_diff_op_moved; + + *new_state = state_git_tree_seen; + return SVN_NO_ERROR; +} + +/* Parse the 'copy from ' line of a git extended unidiff. */ +static svn_error_t * +git_copy_from(enum parse_state *new_state, char *line, svn_patch_t *patch, + apr_pool_t *result_pool, apr_pool_t *scratch_pool) +{ + SVN_ERR(grab_filename(&patch->old_filename, + line + STRLEN_LITERAL("copy from "), + result_pool, scratch_pool)); + + *new_state = state_copy_from_seen; + return SVN_NO_ERROR; +} + +/* Parse the 'copy to ' line of a git extended unidiff. */ +static svn_error_t * +git_copy_to(enum parse_state *new_state, char *line, svn_patch_t *patch, + apr_pool_t *result_pool, apr_pool_t *scratch_pool) +{ + SVN_ERR(grab_filename(&patch->new_filename, line + STRLEN_LITERAL("copy to "), + result_pool, scratch_pool)); + + patch->operation = svn_diff_op_copied; + + *new_state = state_git_tree_seen; + return SVN_NO_ERROR; +} + +/* Parse the 'new file ' line of a git extended unidiff. */ +static svn_error_t * +git_new_file(enum parse_state *new_state, char *line, svn_patch_t *patch, + apr_pool_t *result_pool, apr_pool_t *scratch_pool) +{ + patch->operation = svn_diff_op_added; + + /* Filename already retrieved from diff --git header. */ + + *new_state = state_git_tree_seen; + return SVN_NO_ERROR; +} + +/* Parse the 'deleted file ' line of a git extended unidiff. */ +static svn_error_t * +git_deleted_file(enum parse_state *new_state, char *line, svn_patch_t *patch, + apr_pool_t *result_pool, apr_pool_t *scratch_pool) +{ + patch->operation = svn_diff_op_deleted; + + /* Filename already retrieved from diff --git header. */ + + *new_state = state_git_tree_seen; + return SVN_NO_ERROR; +} + +/* Add a HUNK associated with the property PROP_NAME to PATCH. */ +static svn_error_t * +add_property_hunk(svn_patch_t *patch, const char *prop_name, + svn_diff_hunk_t *hunk, svn_diff_operation_kind_t operation, + apr_pool_t *result_pool) +{ + svn_prop_patch_t *prop_patch; + + prop_patch = svn_hash_gets(patch->prop_patches, prop_name); + + if (! prop_patch) + { + prop_patch = apr_palloc(result_pool, sizeof(svn_prop_patch_t)); + prop_patch->name = prop_name; + prop_patch->operation = operation; + prop_patch->hunks = apr_array_make(result_pool, 1, + sizeof(svn_diff_hunk_t *)); + + svn_hash_sets(patch->prop_patches, prop_name, prop_patch); + } + + APR_ARRAY_PUSH(prop_patch->hunks, svn_diff_hunk_t *) = hunk; + + return SVN_NO_ERROR; +} + +struct svn_patch_file_t +{ + /* The APR file handle to the patch file. */ + apr_file_t *apr_file; + + /* The file offset at which the next patch is expected. */ + apr_off_t next_patch_offset; +}; + +svn_error_t * +svn_diff_open_patch_file(svn_patch_file_t **patch_file, + const char *local_abspath, + apr_pool_t *result_pool) +{ + svn_patch_file_t *p; + + p = apr_palloc(result_pool, sizeof(*p)); + SVN_ERR(svn_io_file_open(&p->apr_file, local_abspath, + APR_READ | APR_BUFFERED, APR_OS_DEFAULT, + result_pool)); + p->next_patch_offset = 0; + *patch_file = p; + + return SVN_NO_ERROR; +} + +/* Parse hunks from APR_FILE and store them in PATCH->HUNKS. + * Parsing stops if no valid next hunk can be found. + * If IGNORE_WHITESPACE is TRUE, lines without + * leading spaces will be treated as context lines. + * Allocate results in RESULT_POOL. + * Use SCRATCH_POOL for temporary allocations. */ +static svn_error_t * +parse_hunks(svn_patch_t *patch, apr_file_t *apr_file, + svn_boolean_t ignore_whitespace, + apr_pool_t *result_pool, apr_pool_t *scratch_pool) +{ + svn_diff_hunk_t *hunk; + svn_boolean_t is_property; + const char *last_prop_name; + const char *prop_name; + svn_diff_operation_kind_t prop_operation; + apr_pool_t *iterpool; + + last_prop_name = NULL; + + patch->hunks = apr_array_make(result_pool, 10, sizeof(svn_diff_hunk_t *)); + patch->prop_patches = apr_hash_make(result_pool); + iterpool = svn_pool_create(scratch_pool); + do + { + svn_pool_clear(iterpool); + + SVN_ERR(parse_next_hunk(&hunk, &is_property, &prop_name, &prop_operation, + patch, apr_file, ignore_whitespace, result_pool, + iterpool)); + + if (hunk && is_property) + { + if (! prop_name) + prop_name = last_prop_name; + else + last_prop_name = prop_name; + SVN_ERR(add_property_hunk(patch, prop_name, hunk, prop_operation, + result_pool)); + } + else if (hunk) + { + APR_ARRAY_PUSH(patch->hunks, svn_diff_hunk_t *) = hunk; + last_prop_name = NULL; + } + + } + while (hunk); + svn_pool_destroy(iterpool); + + return SVN_NO_ERROR; +} + +/* State machine for the diff header parser. + * Expected Input Required state Function to call */ +static struct transition transitions[] = +{ + {"--- ", state_start, diff_minus}, + {"+++ ", state_minus_seen, diff_plus}, + {"diff --git", state_start, git_start}, + {"--- a/", state_git_diff_seen, git_minus}, + {"--- a/", state_git_tree_seen, git_minus}, + {"--- /dev/null", state_git_tree_seen, git_minus}, + {"+++ b/", state_git_minus_seen, git_plus}, + {"+++ /dev/null", state_git_minus_seen, git_plus}, + {"rename from ", state_git_diff_seen, git_move_from}, + {"rename to ", state_move_from_seen, git_move_to}, + {"copy from ", state_git_diff_seen, git_copy_from}, + {"copy to ", state_copy_from_seen, git_copy_to}, + {"new file ", state_git_diff_seen, git_new_file}, + {"deleted file ", state_git_diff_seen, git_deleted_file}, +}; + +svn_error_t * +svn_diff_parse_next_patch(svn_patch_t **patch, + svn_patch_file_t *patch_file, + svn_boolean_t reverse, + svn_boolean_t ignore_whitespace, + apr_pool_t *result_pool, + apr_pool_t *scratch_pool) +{ + apr_off_t pos, last_line; + svn_boolean_t eof; + svn_boolean_t line_after_tree_header_read = FALSE; + apr_pool_t *iterpool; + enum parse_state state = state_start; + + if (apr_file_eof(patch_file->apr_file) == APR_EOF) + { + /* No more patches here. */ + *patch = NULL; + return SVN_NO_ERROR; + } + + *patch = apr_pcalloc(result_pool, sizeof(**patch)); + + pos = patch_file->next_patch_offset; + SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &pos, scratch_pool)); + + iterpool = svn_pool_create(scratch_pool); + do + { + svn_stringbuf_t *line; + svn_boolean_t valid_header_line = FALSE; + int i; + + svn_pool_clear(iterpool); + + /* Remember the current line's offset, and read the line. */ + last_line = pos; + SVN_ERR(svn_io_file_readline(patch_file->apr_file, &line, NULL, &eof, + APR_SIZE_MAX, iterpool, iterpool)); + + if (! eof) + { + /* Update line offset for next iteration. */ + pos = 0; + SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_CUR, &pos, + iterpool)); + } + + /* Run the state machine. */ + for (i = 0; i < (sizeof(transitions) / sizeof(transitions[0])); i++) + { + if (starts_with(line->data, transitions[i].expected_input) + && state == transitions[i].required_state) + { + SVN_ERR(transitions[i].fn(&state, line->data, *patch, + result_pool, iterpool)); + valid_header_line = TRUE; + break; + } + } + + if (state == state_unidiff_found || state == state_git_header_found) + { + /* We have a valid diff header, yay! */ + break; + } + else if (state == state_git_tree_seen && line_after_tree_header_read) + { + /* git patches can contain an index line after the file mode line */ + if (!starts_with(line->data, "index ")) + { + /* We have a valid diff header for a patch with only tree changes. + * Rewind to the start of the line just read, so subsequent calls + * to this function don't end up skipping the line -- it may + * contain a patch. */ + SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &last_line, + scratch_pool)); + break; + } + } + else if (state == state_git_tree_seen) + { + line_after_tree_header_read = TRUE; + } + else if (! valid_header_line && state != state_start + && !starts_with(line->data, "index ")) + { + /* We've encountered an invalid diff header. + * + * Rewind to the start of the line just read - it may be a new + * header that begins there. */ + SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &last_line, + scratch_pool)); + state = state_start; + } + + } + while (! eof); + + (*patch)->reverse = reverse; + if (reverse) + { + const char *temp; + temp = (*patch)->old_filename; + (*patch)->old_filename = (*patch)->new_filename; + (*patch)->new_filename = temp; + } + + if ((*patch)->old_filename == NULL || (*patch)->new_filename == NULL) + { + /* Something went wrong, just discard the result. */ + *patch = NULL; + } + else + SVN_ERR(parse_hunks(*patch, patch_file->apr_file, ignore_whitespace, + result_pool, iterpool)); + + svn_pool_destroy(iterpool); + + patch_file->next_patch_offset = 0; + SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_CUR, + &patch_file->next_patch_offset, scratch_pool)); + + if (*patch) + { + /* Usually, hunks appear in the patch sorted by their original line + * offset. But just in case they weren't parsed in this order for + * some reason, we sort them so that our caller can assume that hunks + * are sorted as if parsed from a usual patch. */ + qsort((*patch)->hunks->elts, (*patch)->hunks->nelts, + (*patch)->hunks->elt_size, compare_hunks); + } + + return SVN_NO_ERROR; +} + +svn_error_t * +svn_diff_close_patch_file(svn_patch_file_t *patch_file, + apr_pool_t *scratch_pool) +{ + return svn_error_trace(svn_io_file_close(patch_file->apr_file, + scratch_pool)); +} |