Rob Pike | d9a0bc9 | 2009-08-05 13:03:46 -0700 | [diff] [blame] | 1 | /* |
| 2 | Redistribution and use in source and binary forms, with or without |
| 3 | modification, are permitted provided that the following conditions are met: |
| 4 | |
| 5 | * Redistributions of source code must retain the above copyright |
| 6 | notice, this list of conditions and the following disclaimer. |
| 7 | |
| 8 | * Redistributions in binary form must reproduce the above copyright |
| 9 | notice, this list of conditions and the following disclaimer in the |
| 10 | documentation and/or other materials provided with the distribution. |
| 11 | |
| 12 | * Neither the name of "The Computer Language Benchmarks Game" nor the |
| 13 | name of "The Computer Language Shootout Benchmarks" nor the names of |
| 14 | its contributors may be used to endorse or promote products derived |
| 15 | from this software without specific prior written permission. |
| 16 | |
| 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 27 | POSSIBILITY OF SUCH DAMAGE. |
| 28 | */ |
| 29 | |
| 30 | /* |
| 31 | ** The Computer Language Shootout |
| 32 | ** http://shootout.alioth.debian.org/ |
| 33 | ** contributed by Mike Pall |
| 34 | ** |
| 35 | ** regex-dna benchmark using PCRE |
| 36 | ** |
| 37 | ** compile with: |
| 38 | ** gcc -O3 -fomit-frame-pointer -o regexdna regexdna.c -lpcre |
| 39 | */ |
| 40 | |
| 41 | #define __USE_STRING_INLINES |
| 42 | #include <stdio.h> |
| 43 | #include <string.h> |
| 44 | #include <stdlib.h> |
| 45 | #include <pcre.h> |
| 46 | |
| 47 | typedef struct fbuf { |
| 48 | char *buf; |
| 49 | size_t size, len; |
| 50 | } fbuf_t; |
| 51 | |
| 52 | static void fb_init(fbuf_t *b) |
| 53 | { |
| 54 | b->buf = NULL; |
| 55 | b->len = b->size = 0; |
| 56 | } |
| 57 | |
| 58 | static char *fb_need(fbuf_t *b, size_t need) |
| 59 | { |
| 60 | need += b->len; |
| 61 | if (need > b->size) { |
| 62 | if (b->size == 0) b->size = need; |
| 63 | else while (need > b->size) b->size += b->size; |
| 64 | if (!(b->buf = realloc(b->buf, b->size))) exit(1); |
| 65 | } |
| 66 | return b->buf+b->len; |
| 67 | } |
| 68 | |
| 69 | #define FB_MINREAD (3<<16) |
| 70 | |
| 71 | /* Read all of a stdio stream into dst buffer. */ |
| 72 | static size_t fb_readall(fbuf_t *dst, FILE *fp) |
| 73 | { |
| 74 | char *dp; |
| 75 | int n; |
| 76 | for (dp = fb_need(dst, FB_MINREAD); |
| 77 | (n = fread(dp, 1, dst->size-dst->len, fp)) > 0; |
| 78 | dp = fb_need(dst, FB_MINREAD)) dst->len += n; |
| 79 | if (ferror(fp)) exit(1); |
| 80 | return dst->len; |
| 81 | } |
| 82 | |
| 83 | /* Substitute pattern p with replacement r, copying from src to dst buffer. */ |
| 84 | static size_t fb_subst(fbuf_t *dst, fbuf_t *src, const char *p, const char *r) |
| 85 | { |
| 86 | pcre *re; |
| 87 | pcre_extra *re_ex; |
| 88 | const char *re_e; |
| 89 | char *dp; |
| 90 | int re_eo, m[3], pos, rlen, clen; |
| 91 | if (!(re = pcre_compile(p, PCRE_CASELESS, &re_e, &re_eo, NULL))) exit(1); |
| 92 | re_ex = pcre_study(re, 0, &re_e); |
| 93 | for (dst->len = 0, rlen = strlen(r), pos = 0; |
| 94 | pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0; |
| 95 | pos = m[1]) { |
| 96 | clen = m[0]-pos; |
| 97 | dp = fb_need(dst, clen+rlen); |
| 98 | dst->len += clen+rlen; |
| 99 | memcpy(dp, src->buf+pos, clen); |
| 100 | memcpy(dp+clen, r, rlen); |
| 101 | } |
| 102 | clen = src->len-pos; |
| 103 | dp = fb_need(dst, clen); |
| 104 | dst->len += clen; |
| 105 | memcpy(dp, src->buf+pos, clen); |
| 106 | return dst->len; |
| 107 | } |
| 108 | |
| 109 | /* Count all matches with pattern p in src buffer. */ |
| 110 | static int fb_countmatches(fbuf_t *src, const char *p) |
| 111 | { |
| 112 | pcre *re; |
| 113 | pcre_extra *re_ex; |
| 114 | const char *re_e; |
| 115 | int re_eo, m[3], pos, count; |
| 116 | if (!(re = pcre_compile(p, PCRE_CASELESS, &re_e, &re_eo, NULL))) exit(1); |
| 117 | re_ex = pcre_study(re, 0, &re_e); |
| 118 | for (count = 0, pos = 0; |
| 119 | pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0; |
| 120 | pos = m[1]) count++; |
| 121 | return count; |
| 122 | } |
| 123 | |
| 124 | static const char *variants[] = { |
| 125 | "agggtaaa|tttaccct", "[cgt]gggtaaa|tttaccc[acg]", |
| 126 | "a[act]ggtaaa|tttacc[agt]t", "ag[act]gtaaa|tttac[agt]ct", |
| 127 | "agg[act]taaa|ttta[agt]cct", "aggg[acg]aaa|ttt[cgt]ccct", |
| 128 | "agggt[cgt]aa|tt[acg]accct", "agggta[cgt]a|t[acg]taccct", |
| 129 | "agggtaa[cgt]|[acg]ttaccct", NULL |
| 130 | }; |
| 131 | |
| 132 | static const char *subst[] = { |
| 133 | "B", "(c|g|t)", "D", "(a|g|t)", "H", "(a|c|t)", "K", "(g|t)", |
| 134 | "M", "(a|c)", "N", "(a|c|g|t)", "R", "(a|g)", "S", "(c|g)", |
| 135 | "V", "(a|c|g)", "W", "(a|t)", "Y", "(c|t)", NULL |
| 136 | }; |
| 137 | |
| 138 | int main(int argc, char **argv) |
| 139 | { |
| 140 | fbuf_t seq[2]; |
| 141 | const char **pp; |
| 142 | size_t ilen, clen, slen; |
| 143 | int flip; |
| 144 | fb_init(&seq[0]); |
| 145 | fb_init(&seq[1]); |
| 146 | ilen = fb_readall(&seq[0], stdin); |
| 147 | clen = fb_subst(&seq[1], &seq[0], ">.*|\n", ""); |
| 148 | for (pp = variants; *pp; pp++) |
| 149 | printf("%s %d\n", *pp, fb_countmatches(&seq[1], *pp)); |
| 150 | for (slen = 0, flip = 1, pp = subst; *pp; pp += 2, flip = 1-flip) |
| 151 | slen = fb_subst(&seq[1-flip], &seq[flip], *pp, pp[1]); |
| 152 | printf("\n%zu\n%zu\n%zu\n", ilen, clen, slen); |
| 153 | return 0; |
| 154 | } |