diff options
author | Andrey A. Chernov <ache@FreeBSD.org> | 2016-07-14 09:19:53 +0000 |
---|---|---|
committer | Andrey A. Chernov <ache@FreeBSD.org> | 2016-07-14 09:19:53 +0000 |
commit | 5b4fa425ba608143d8cf2d1201dbd7acae1ab23e (patch) | |
tree | 58196d01acff3c29cb50b5981667bad247987b6f | |
parent | 12eae8c8f346cb459a388259ca98faebdac47038 (diff) | |
download | src-5b4fa425ba608143d8cf2d1201dbd7acae1ab23e.tar.gz src-5b4fa425ba608143d8cf2d1201dbd7acae1ab23e.zip |
Notes
-rw-r--r-- | usr.bin/tr/str.c | 44 | ||||
-rw-r--r-- | usr.bin/tr/tr.1 | 21 | ||||
-rw-r--r-- | usr.bin/tr/tr.c | 45 |
3 files changed, 91 insertions, 19 deletions
diff --git a/usr.bin/tr/str.c b/usr.bin/tr/str.c index 0f686c4d1e12..333ca5c49831 100644 --- a/usr.bin/tr/str.c +++ b/usr.bin/tr/str.c @@ -53,7 +53,7 @@ static int backslash(STR *, int *); static int bracket(STR *); static void genclass(STR *); static void genequiv(STR *); -static int genrange(STR *); +static int genrange(STR *, int); static void genseq(STR *); wint_t @@ -93,7 +93,7 @@ next(STR *s) } /* We can start a range at any time. */ - if (s->str[0] == '-' && genrange(s)) + if (s->str[0] == '-' && genrange(s, is_octal)) return (next(s)); return (1); case RANGE: @@ -237,16 +237,18 @@ genequiv(STR *s) } static int -genrange(STR *s) +genrange(STR *s, int was_octal) { - int stopval; + int stopval, octal; char *savestart; + int n, cnt, *p; size_t clen; wchar_t wc; + octal = 0; savestart = s->str; if (*++s->str == '\\') - stopval = backslash(s, NULL); + stopval = backslash(s, &octal); else { clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); if (clen == (size_t)-1 || clen == (size_t)-2) @@ -254,13 +256,37 @@ genrange(STR *s) stopval = wc; s->str += clen; } - if (stopval < s->lastch) { + /* + * XXX Characters are not ordered according to collating sequence in + * multibyte locales. + */ + if (octal || was_octal || MB_CUR_MAX > 1) { + if (stopval < s->lastch) { + s->str = savestart; + return (0); + } + s->cnt = stopval - s->lastch + 1; + s->state = RANGE; + --s->lastch; + return (1); + } + if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) { s->str = savestart; return (0); } - s->cnt = stopval - s->lastch + 1; - s->state = RANGE; - --s->lastch; + if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL) + err(1, "genrange() malloc"); + for (cnt = 0; cnt < NCHARS_SB; cnt++) + if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 && + charcoll((const void *)&cnt, (const void *)&stopval) <= 0) + *p++ = cnt; + *p = OOBCH; + n = p - s->set; + + s->cnt = 0; + s->state = SET; + if (n > 1) + mergesort(s->set, n, sizeof(*(s->set)), charcoll); return (1); } diff --git a/usr.bin/tr/tr.1 b/usr.bin/tr/tr.1 index e31f4fb0e57a..37e68f3e7ca1 100644 --- a/usr.bin/tr/tr.1 +++ b/usr.bin/tr/tr.1 @@ -164,6 +164,14 @@ as defined by the collation sequence. If either or both of the range endpoints are octal sequences, it represents the range of specific coded values between the range endpoints, inclusive. +.Pp +.Bf Em +See the +.Sx COMPATIBILITY +section below for an important note regarding +differences in the way the current +implementation interprets range expressions differently from +previous implementations. .Ef .It [:class:] Represents all characters belonging to the defined character class. @@ -299,16 +307,22 @@ Remove diacritical marks from all accented variants of the letter .Pp .Dl "tr \*q[=e=]\*q \*qe\*q" .Sh COMPATIBILITY +Previous .Fx implementations of .Nm did not order characters in range expressions according to the current -locale's collation order, making it possible to convert accented Latin -characters from upper to lower case using +locale's collation order, making it possible to convert unaccented Latin +characters (esp.\& as found in English text) from upper to lower case using the traditional .Ux idiom of .Dq Li "tr A-Z a-z" . +Since +.Nm +now obeys the locale's collation order, this idiom may not produce +correct results when there is not a 1:1 mapping between lower and +upper case, or when the order of characters within the two cases differs. As noted in the .Sx EXAMPLES section above, the character class expressions @@ -320,9 +334,6 @@ should be used instead of explicit character ranges like and .Dq Li A-Z . .Pp -.Dq Li [=equiv=] -expression is implemented for single byte locales only. -.Pp System V has historically implemented character ranges using the syntax .Dq Li [c-c] instead of the diff --git a/usr.bin/tr/tr.c b/usr.bin/tr/tr.c index 033b75cb5300..6eea2cb60409 100644 --- a/usr.bin/tr/tr.c +++ b/usr.bin/tr/tr.c @@ -68,8 +68,10 @@ static void usage(void); int main(int argc, char **argv) { + static int carray[NCHARS_SB]; struct cmap *map; struct cset *delete, *squeeze; + int n, *p; int Cflag, cflag, dflag, sflag, isstring2; wint_t ch, cnt, lastch; @@ -252,7 +254,7 @@ main(int argc, char **argv) (void)next(&s2); } endloop: - if (cflag || Cflag) { + if (cflag || (Cflag && MB_CUR_MAX > 1)) { /* * This is somewhat tricky: since the character set is * potentially huge, we need to avoid allocating a map @@ -270,11 +272,10 @@ endloop: if (Cflag && !iswrune(cnt)) continue; if (cmap_lookup(map, cnt) == OOBCH) { - if (next(&s2)) { + if (next(&s2)) cmap_add(map, cnt, s2.lastch); - if (sflag) - cset_add(squeeze, s2.lastch); - } + if (sflag) + cset_add(squeeze, s2.lastch); } else cmap_add(map, cnt, cnt); if ((s2.state == EOS || s2.state == INFINITE) && @@ -282,6 +283,30 @@ endloop: break; } cmap_default(map, s2.lastch); + } else if (Cflag) { + for (p = carray, cnt = 0; cnt < NCHARS_SB; cnt++) { + if (cmap_lookup(map, cnt) == OOBCH && iswrune(cnt)) + *p++ = cnt; + else + cmap_add(map, cnt, cnt); + } + n = p - carray; + if (Cflag && n > 1) + (void)mergesort(carray, n, sizeof(*carray), charcoll); + + s2.str = argv[1]; + s2.state = NORMAL; + for (cnt = 0; cnt < n; cnt++) { + (void)next(&s2); + cmap_add(map, carray[cnt], s2.lastch); + /* + * Chars taken from s2 can be different this time + * due to lack of complex upper/lower processing, + * so fill string2 again to not miss some. + */ + if (sflag) + cset_add(squeeze, s2.lastch); + } } cset_cache(squeeze); @@ -326,6 +351,16 @@ setup(char *arg, STR *str, int cflag, int Cflag) return (cs); } +int +charcoll(const void *a, const void *b) +{ + static char sa[2], sb[2]; + + sa[0] = *(const int *)a; + sb[0] = *(const int *)b; + return (strcoll(sa, sb)); +} + static void usage(void) { |