aboutsummaryrefslogtreecommitdiff
path: root/lib/libc/tests/regex/multibyte.sh
diff options
context:
space:
mode:
Diffstat (limited to 'lib/libc/tests/regex/multibyte.sh')
-rwxr-xr-xlib/libc/tests/regex/multibyte.sh93
1 files changed, 93 insertions, 0 deletions
diff --git a/lib/libc/tests/regex/multibyte.sh b/lib/libc/tests/regex/multibyte.sh
new file mode 100755
index 000000000000..18323f500a2b
--- /dev/null
+++ b/lib/libc/tests/regex/multibyte.sh
@@ -0,0 +1,93 @@
+atf_test_case bmpat
+bmpat_head()
+{
+ atf_set "descr" "Check matching multibyte characters (PR153502)"
+}
+bmpat_body()
+{
+ export LC_CTYPE="C.UTF-8"
+
+ printf 'é' | atf_check -o "inline:é" \
+ sed -ne '/^.$/p'
+ printf 'éé' | atf_check -o "inline:éé" \
+ sed -ne '/^..$/p'
+ printf 'aéa' | atf_check -o "inline:aéa" \
+ sed -ne '/a.a/p'
+ printf 'aéa'| atf_check -o "inline:aéa" \
+ sed -ne '/a.*a/p'
+ printf 'aaéaa' | atf_check -o "inline:aaéaa" \
+ sed -ne '/aa.aa/p'
+ printf 'aéaéa' | atf_check -o "inline:aéaéa" \
+ sed -ne '/a.a.a/p'
+ printf 'éa' | atf_check -o "inline:éa" \
+ sed -ne '/.a/p'
+ printf 'aéaa' | atf_check -o "inline:aéaa" \
+ sed -ne '/a.aa/p'
+ printf 'éaé' | atf_check -o "inline:éaé" \
+ sed -ne '/.a./p'
+}
+
+atf_test_case icase
+icase_head()
+{
+ atf_set "descr" "Check case-insensitive matching for characters 128-255"
+}
+icase_body()
+{
+ export LC_CTYPE="C.UTF-8"
+
+ a=$(printf '\302\265\n') # U+00B5
+ b=$(printf '\316\234\n') # U+039C
+ c=$(printf '\316\274\n') # U+03BC
+
+ echo $b | atf_check -o "inline:$b\n" sed -ne "/$a/Ip"
+ echo $c | atf_check -o "inline:$c\n" sed -ne "/$a/Ip"
+}
+
+atf_test_case mbset cleanup
+mbset_head()
+{
+ atf_set "descr" "Check multibyte sets matching"
+}
+mbset_body()
+{
+ export LC_CTYPE="C.UTF-8"
+
+ # This involved an erroneously implemented optimization which reduces
+ # single-element sets to an exact match with a single codepoint.
+ # Match sets record small-codepoint characters in a bitmap and
+ # large-codepoint characters in an array; the optimization would falsely
+ # trigger if either the bitmap or the array was a singleton, ignoring
+ # the members of the other side of the set.
+ #
+ # To exercise this, we construct sets which have one member of one side
+ # and one or more of the other, and verify that all members can be
+ # found.
+ printf "a" > mbset; atf_check -o not-empty sed -ne '/[aà]/p' mbset
+ printf "à" > mbset; atf_check -o not-empty sed -ne '/[aà]/p' mbset
+ printf "a" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset
+ printf "à" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset
+ printf "á" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset
+ printf "à" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset
+ printf "a" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset
+ printf "b" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset
+ printf "a" > mbset; atf_check -o not-empty sed -Ene '/[aà]/p' mbset
+ printf "à" > mbset; atf_check -o not-empty sed -Ene '/[aà]/p' mbset
+ printf "a" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset
+ printf "à" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset
+ printf "á" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset
+ printf "à" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset
+ printf "a" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset
+ printf "b" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset
+}
+mbset_cleanup()
+{
+ rm -f mbset
+}
+
+atf_init_test_cases()
+{
+ atf_add_test_case bmpat
+ atf_add_test_case icase
+ atf_add_test_case mbset
+}