summaryrefslogtreecommitdiff
path: root/usr.bin/tr
diff options
context:
space:
mode:
author: Andrey A. Chernov <ache@FreeBSD.org> 2016-07-14 09:19:53 +0000
committer: Andrey A. Chernov <ache@FreeBSD.org> 2016-07-14 09:19:53 +0000
commit: 5b4fa425ba608143d8cf2d1201dbd7acae1ab23e (patch)
tree: 58196d01acff3c29cb50b5981667bad247987b6f /usr.bin/tr
parent: 12eae8c8f346cb459a388259ca98faebdac47038 (diff)
downloadsrc-test2-5b4fa425ba608143d8cf2d1201dbd7acae1ab23e.tar.gz
src-test2-5b4fa425ba608143d8cf2d1201dbd7acae1ab23e.zip
Back out non-collating [a-z] ranges (r302594).
Instead of changing the whole course to another POSIX-permitted way for consistency and uniformity, I decided to completely ignore the missing regex functionality and focus on fixing bugs in what we have now; there are too many small obstacles to choosing the other way, counting ports. The corresponding libc changes are backed out in r302824.
Notes
Notes: svn path=/head/; revision=302825
Diffstat (limited to 'usr.bin/tr')
-rw-r--r-- usr.bin/tr/str.c 44
-rw-r--r-- usr.bin/tr/tr.1 21
-rw-r--r-- usr.bin/tr/tr.c 45
3 files changed, 91 insertions, 19 deletions
diff --git a/usr.bin/tr/str.c b/usr.bin/tr/str.c
index 0f686c4d1e12..333ca5c49831 100644
--- a/usr.bin/tr/str.c
+++ b/usr.bin/tr/str.c
@@ -53,7 +53,7 @@ static int backslash(STR *, int *);
static int bracket(STR *);
static void genclass(STR *);
static void genequiv(STR *);
-static int genrange(STR *);
+static int genrange(STR *, int);
static void genseq(STR *);
wint_t
@@ -93,7 +93,7 @@ next(STR *s)
}
/* We can start a range at any time. */
- if (s->str[0] == '-' && genrange(s))
+ if (s->str[0] == '-' && genrange(s, is_octal))
return (next(s));
return (1);
case RANGE:
@@ -237,16 +237,18 @@ genequiv(STR *s)
}
static int
-genrange(STR *s)
+genrange(STR *s, int was_octal)
{
- int stopval;
+ int stopval, octal;
char *savestart;
+ int n, cnt, *p;
size_t clen;
wchar_t wc;
+ octal = 0;
savestart = s->str;
if (*++s->str == '\\')
- stopval = backslash(s, NULL);
+ stopval = backslash(s, &octal);
else {
clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
if (clen == (size_t)-1 || clen == (size_t)-2)
@@ -254,13 +256,37 @@ genrange(STR *s)
stopval = wc;
s->str += clen;
}
- if (stopval < s->lastch) {
+ /*
+ * XXX Characters are not ordered according to collating sequence in
+ * multibyte locales.
+ */
+ if (octal || was_octal || MB_CUR_MAX > 1) {
+ if (stopval < s->lastch) {
+ s->str = savestart;
+ return (0);
+ }
+ s->cnt = stopval - s->lastch + 1;
+ s->state = RANGE;
+ --s->lastch;
+ return (1);
+ }
+ if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
s->str = savestart;
return (0);
}
- s->cnt = stopval - s->lastch + 1;
- s->state = RANGE;
- --s->lastch;
+ if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
+ err(1, "genrange() malloc");
+ for (cnt = 0; cnt < NCHARS_SB; cnt++)
+ if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
+ charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
+ *p++ = cnt;
+ *p = OOBCH;
+ n = p - s->set;
+
+ s->cnt = 0;
+ s->state = SET;
+ if (n > 1)
+ mergesort(s->set, n, sizeof(*(s->set)), charcoll);
return (1);
}
diff --git a/usr.bin/tr/tr.1 b/usr.bin/tr/tr.1
index e31f4fb0e57a..37e68f3e7ca1 100644
--- a/usr.bin/tr/tr.1
+++ b/usr.bin/tr/tr.1
@@ -164,6 +164,14 @@ as defined by the collation sequence.
If either or both of the range endpoints are octal sequences, it
represents the range of specific coded values between the
range endpoints, inclusive.
+.Pp
+.Bf Em
+See the
+.Sx COMPATIBILITY
+section below for an important note regarding
+the way the current
+implementation interprets range expressions differently from
+previous implementations.
.Ef
.It [:class:]
Represents all characters belonging to the defined character class.
@@ -299,16 +307,22 @@ Remove diacritical marks from all accented variants of the letter
.Pp
.Dl "tr \*q[=e=]\*q \*qe\*q"
.Sh COMPATIBILITY
+Previous
.Fx
implementations of
.Nm
did not order characters in range expressions according to the current
-locale's collation order, making it possible to convert accented Latin
-characters from upper to lower case using
+locale's collation order, making it possible to convert unaccented Latin
+characters (esp.\& as found in English text) from upper to lower case using
the traditional
.Ux
idiom of
.Dq Li "tr A-Z a-z" .
+Since
+.Nm
+now obeys the locale's collation order, this idiom may not produce
+correct results when there is not a 1:1 mapping between lower and
+upper case, or when the order of characters within the two cases differs.
As noted in the
.Sx EXAMPLES
section above, the character class expressions
@@ -320,9 +334,6 @@ should be used instead of explicit character ranges like
and
.Dq Li A-Z .
.Pp
-.Dq Li [=equiv=]
-expression is implemented for single byte locales only.
-.Pp
System V has historically implemented character ranges using the syntax
.Dq Li [c-c]
instead of the
diff --git a/usr.bin/tr/tr.c b/usr.bin/tr/tr.c
index 033b75cb5300..6eea2cb60409 100644
--- a/usr.bin/tr/tr.c
+++ b/usr.bin/tr/tr.c
@@ -68,8 +68,10 @@ static void usage(void);
int
main(int argc, char **argv)
{
+ static int carray[NCHARS_SB];
struct cmap *map;
struct cset *delete, *squeeze;
+ int n, *p;
int Cflag, cflag, dflag, sflag, isstring2;
wint_t ch, cnt, lastch;
@@ -252,7 +254,7 @@ main(int argc, char **argv)
(void)next(&s2);
}
endloop:
- if (cflag || Cflag) {
+ if (cflag || (Cflag && MB_CUR_MAX > 1)) {
/*
* This is somewhat tricky: since the character set is
* potentially huge, we need to avoid allocating a map
@@ -270,11 +272,10 @@ endloop:
if (Cflag && !iswrune(cnt))
continue;
if (cmap_lookup(map, cnt) == OOBCH) {
- if (next(&s2)) {
+ if (next(&s2))
cmap_add(map, cnt, s2.lastch);
- if (sflag)
- cset_add(squeeze, s2.lastch);
- }
+ if (sflag)
+ cset_add(squeeze, s2.lastch);
} else
cmap_add(map, cnt, cnt);
if ((s2.state == EOS || s2.state == INFINITE) &&
@@ -282,6 +283,30 @@ endloop:
break;
}
cmap_default(map, s2.lastch);
+ } else if (Cflag) {
+ for (p = carray, cnt = 0; cnt < NCHARS_SB; cnt++) {
+ if (cmap_lookup(map, cnt) == OOBCH && iswrune(cnt))
+ *p++ = cnt;
+ else
+ cmap_add(map, cnt, cnt);
+ }
+ n = p - carray;
+ if (Cflag && n > 1)
+ (void)mergesort(carray, n, sizeof(*carray), charcoll);
+
+ s2.str = argv[1];
+ s2.state = NORMAL;
+ for (cnt = 0; cnt < n; cnt++) {
+ (void)next(&s2);
+ cmap_add(map, carray[cnt], s2.lastch);
+ /*
+ * Chars taken from s2 can be different this time
+ * due to lack of complex upper/lower processing,
+ * so fill string2 again to not miss some.
+ */
+ if (sflag)
+ cset_add(squeeze, s2.lastch);
+ }
}
cset_cache(squeeze);
@@ -326,6 +351,16 @@ setup(char *arg, STR *str, int cflag, int Cflag)
return (cs);
}
+int
+charcoll(const void *a, const void *b)	/* comparator (passed to mergesort() in this change): a and b point to int character codes, compared as one-character strings with strcoll() */
+{
+ static char sa[2], sb[2];	/* static storage is zero-initialized and only element [0] is written below, so each buffer stays a NUL-terminated one-character string */
+
+ sa[0] = *(const int *)a;	/* the int code is narrowed to a single char */
+ sb[0] = *(const int *)b;
+ return (strcoll(sa, sb));	/* result of strcoll(): negative, zero or positive according to the current locale's collation */
+}
+
static void
usage(void)
{