Russ Cox | b4cae4a | 2011-06-30 10:26:22 -0400 | [diff] [blame] | 1 | // Copyright 2011 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package syntax |
| 6 | |
| 7 | import "testing" |
| 8 | |
| 9 | var simplifyTests = []struct { |
| 10 | Regexp string |
| 11 | Simple string |
| 12 | }{ |
| 13 | // Already-simple constructs |
| 14 | {`a`, `a`}, |
| 15 | {`ab`, `ab`}, |
| 16 | {`a|b`, `[a-b]`}, |
| 17 | {`ab|cd`, `ab|cd`}, |
| 18 | {`(ab)*`, `(ab)*`}, |
| 19 | {`(ab)+`, `(ab)+`}, |
| 20 | {`(ab)?`, `(ab)?`}, |
Russ Cox | 177dca7 | 2011-09-08 14:18:02 -0400 | [diff] [blame] | 21 | {`.`, `(?s:.)`}, |
Russ Cox | b4cae4a | 2011-06-30 10:26:22 -0400 | [diff] [blame] | 22 | {`^`, `^`}, |
| 23 | {`$`, `$`}, |
| 24 | {`[ac]`, `[ac]`}, |
| 25 | {`[^ac]`, `[^ac]`}, |
| 26 | |
| 27 | // Posix character classes |
| 28 | {`[[:alnum:]]`, `[0-9A-Za-z]`}, |
| 29 | {`[[:alpha:]]`, `[A-Za-z]`}, |
| 30 | {`[[:blank:]]`, `[\t ]`}, |
| 31 | {`[[:cntrl:]]`, `[\x00-\x1f\x7f]`}, |
| 32 | {`[[:digit:]]`, `[0-9]`}, |
| 33 | {`[[:graph:]]`, `[!-~]`}, |
| 34 | {`[[:lower:]]`, `[a-z]`}, |
| 35 | {`[[:print:]]`, `[ -~]`}, |
| 36 | {`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"}, |
| 37 | {`[[:space:]]`, `[\t-\r ]`}, |
| 38 | {`[[:upper:]]`, `[A-Z]`}, |
| 39 | {`[[:xdigit:]]`, `[0-9A-Fa-f]`}, |
| 40 | |
| 41 | // Perl character classes |
| 42 | {`\d`, `[0-9]`}, |
| 43 | {`\s`, `[\t-\n\f-\r ]`}, |
| 44 | {`\w`, `[0-9A-Z_a-z]`}, |
| 45 | {`\D`, `[^0-9]`}, |
| 46 | {`\S`, `[^\t-\n\f-\r ]`}, |
| 47 | {`\W`, `[^0-9A-Z_a-z]`}, |
| 48 | {`[\d]`, `[0-9]`}, |
| 49 | {`[\s]`, `[\t-\n\f-\r ]`}, |
| 50 | {`[\w]`, `[0-9A-Z_a-z]`}, |
| 51 | {`[\D]`, `[^0-9]`}, |
| 52 | {`[\S]`, `[^\t-\n\f-\r ]`}, |
| 53 | {`[\W]`, `[^0-9A-Z_a-z]`}, |
| 54 | |
| 55 | // Posix repetitions |
| 56 | {`a{1}`, `a`}, |
| 57 | {`a{2}`, `aa`}, |
| 58 | {`a{5}`, `aaaaa`}, |
| 59 | {`a{0,1}`, `a?`}, |
| 60 | // The next three are illegible because Simplify inserts (?:) |
| 61 | // parens instead of () parens to avoid creating extra |
| 62 | // captured subexpressions. The comments show a version with fewer parens. |
| 63 | {`(a){0,2}`, `(?:(a)(a)?)?`}, // (aa?)? |
| 64 | {`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // (a(a(aa?)?)?)? |
| 65 | {`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)? |
| 66 | {`a{0,2}`, `(?:aa?)?`}, // (aa?)? |
| 67 | {`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`}, // (a(a(aa?)?)?)? |
| 68 | {`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`}, // aa(a(a(aa?)?)?)? |
| 69 | {`a{0,}`, `a*`}, |
| 70 | {`a{1,}`, `a+`}, |
| 71 | {`a{2,}`, `aa+`}, |
| 72 | {`a{5,}`, `aaaaa+`}, |
| 73 | |
| 74 | // Test that operators simplify their arguments. |
| 75 | {`(?:a{1,}){1,}`, `a+`}, |
| 76 | {`(a{1,}b{1,})`, `(a+b+)`}, |
| 77 | {`a{1,}|b{1,}`, `a+|b+`}, |
| 78 | {`(?:a{1,})*`, `(?:a+)*`}, |
| 79 | {`(?:a{1,})+`, `a+`}, |
| 80 | {`(?:a{1,})?`, `(?:a+)?`}, |
| 81 | {``, `(?:)`}, |
| 82 | {`a{0}`, `(?:)`}, |
| 83 | |
| 84 | // Character class simplification |
| 85 | {`[ab]`, `[a-b]`}, |
| 86 | {`[a-za-za-z]`, `[a-z]`}, |
| 87 | {`[A-Za-zA-Za-z]`, `[A-Za-z]`}, |
| 88 | {`[ABCDEFGH]`, `[A-H]`}, |
| 89 | {`[AB-CD-EF-GH]`, `[A-H]`}, |
| 90 | {`[W-ZP-XE-R]`, `[E-Z]`}, |
| 91 | {`[a-ee-gg-m]`, `[a-m]`}, |
| 92 | {`[a-ea-ha-m]`, `[a-m]`}, |
| 93 | {`[a-ma-ha-e]`, `[a-m]`}, |
| 94 | {`[a-zA-Z0-9 -~]`, `[ -~]`}, |
| 95 | |
| 96 | // Empty character classes |
| 97 | {`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`}, |
| 98 | |
| 99 | // Full character classes |
Russ Cox | 177dca7 | 2011-09-08 14:18:02 -0400 | [diff] [blame] | 100 | {`[[:cntrl:][:^cntrl:]]`, `(?s:.)`}, |
Russ Cox | b4cae4a | 2011-06-30 10:26:22 -0400 | [diff] [blame] | 101 | |
| 102 | // Unicode case folding. |
| 103 | {`(?i)A`, `(?i:A)`}, |
Russ Cox | 177dca7 | 2011-09-08 14:18:02 -0400 | [diff] [blame] | 104 | {`(?i)a`, `(?i:A)`}, |
Russ Cox | b4cae4a | 2011-06-30 10:26:22 -0400 | [diff] [blame] | 105 | {`(?i)[A]`, `(?i:A)`}, |
| 106 | {`(?i)[a]`, `(?i:A)`}, |
| 107 | {`(?i)K`, `(?i:K)`}, |
Russ Cox | 177dca7 | 2011-09-08 14:18:02 -0400 | [diff] [blame] | 108 | {`(?i)k`, `(?i:K)`}, |
| 109 | {`(?i)\x{212a}`, "(?i:K)"}, |
Russ Cox | b4cae4a | 2011-06-30 10:26:22 -0400 | [diff] [blame] | 110 | {`(?i)[K]`, "[Kk\u212A]"}, |
| 111 | {`(?i)[k]`, "[Kk\u212A]"}, |
| 112 | {`(?i)[\x{212a}]`, "[Kk\u212A]"}, |
| 113 | {`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"}, |
| 114 | {`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"}, |
Russ Cox | 177dca7 | 2011-09-08 14:18:02 -0400 | [diff] [blame] | 115 | {`(?i)[\x00-\x{10FFFF}]`, `(?s:.)`}, |
Russ Cox | b4cae4a | 2011-06-30 10:26:22 -0400 | [diff] [blame] | 116 | |
| 117 | // Empty string as a regular expression. |
| 118 | // The empty string must be preserved inside parens in order |
| 119 | // to make submatches work right, so these tests are less |
| 120 | // interesting than they might otherwise be. String inserts |
| 121 | // explicit (?:) in place of non-parenthesized empty strings, |
| 122 | // to make them easier to spot for other parsers. |
| 123 | {`(a|b|)`, `([a-b]|(?:))`}, |
| 124 | {`(|)`, `()`}, |
| 125 | {`a()`, `a()`}, |
| 126 | {`(()|())`, `(()|())`}, |
| 127 | {`(a|)`, `(a|(?:))`}, |
| 128 | {`ab()cd()`, `ab()cd()`}, |
| 129 | {`()`, `()`}, |
| 130 | {`()*`, `()*`}, |
| 131 | {`()+`, `()+`}, |
| 132 | {`()?`, `()?`}, |
| 133 | {`(){0}`, `(?:)`}, |
| 134 | {`(){1}`, `()`}, |
| 135 | {`(){1,}`, `()+`}, |
| 136 | {`(){0,2}`, `(?:()()?)?`}, |
| 137 | } |
| 138 | |
| 139 | func TestSimplify(t *testing.T) { |
| 140 | for _, tt := range simplifyTests { |
| 141 | re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine) |
| 142 | if err != nil { |
| 143 | t.Errorf("Parse(%#q) = error %v", tt.Regexp, err) |
| 144 | continue |
| 145 | } |
| 146 | s := re.Simplify().String() |
| 147 | if s != tt.Simple { |
| 148 | t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple) |
| 149 | } |
| 150 | } |
| 151 | } |