弹剑而歌: python merge regular expression from file

#!/usr/bin/env python
"""Merge a per-site regular expression read from a file into a referrer matcher.

The script takes one access-log record, extracts the referrer URL from it,
reads a rule of the form ``site, r'regexp'`` from ``regexps.conf``, merges
the rule into a single URL pattern and prints the search keyword captured
by the rule's ``word`` group.

Example ``regexps.conf`` content::

    google.com, r'search.*[?&]q=(?P<word>.*)&.*'
"""

import re

# record = open('test.txt', 'r').read()
record = '60.176.247.144 - - [09/May/2007:16:40:54 +0800] "GET /bbs/thread-19160-1-7.html HTTP/1.1" 200 22481 "http://www.google.com/search?hl=en&q=%E6%94%AF%E4%BB%98%E5%AE%9D+HAS_NO_PRIVILEGE&btnG=Google+Search" "Mozilla/5.0 (Windows; U; Windows NT 5.2; zh-CN; rv:1.8.1.3) Gecko/20070309 (FoxPlus) Firefox/2.0.0.3"'

# NOTE(review): the published listing lost the "<name>" part of every named
# group. Only 'rlink' and 'word' are referenced by the code; the other group
# names used here (ip, time, path, site, query) are reconstructed - confirm.
regexp1 = r'(?P<ip>[\d\.]+) - - (?P<time>\[.*\]) "GET (?P<path>.*) HTTP/[\d\.]+" \d+ \d+ "(?P<rlink>http://.*)" ".*"'

# One rule line: a site name, a comma, then a regexp written as r'...'.
_RULE_RE = re.compile(r"\s*([\w\.]+)\s*,\s*r'(.*)'")


def parse_rule(text):
    """Split a ``site, r'regexp'`` rule into a ``(site, regexp)`` tuple.

    Raises:
        ValueError: if *text* does not have the expected shape.
    """
    rule_match = _RULE_RE.match(text)
    if rule_match is None:
        raise ValueError('malformed rule: %r' % (text,))
    return rule_match.groups()


def merge_regexp(site, regexp2):
    """Return the referrer-URL pattern built from *site* and *regexp2*.

    Both values are inserted verbatim, so any named group inside *regexp2*
    (such as ``word``) is available on the merged pattern's match object.
    """
    # NOTE(review): *site* is not escaped, so the '.' in 'google.com' matches
    # any character. Left as is to keep the printed pattern unchanged.
    return r'http://[\w\.]*(?P<site>%s)/(?P<query>%s)' % (site, regexp2)


def main():
    """Run the demo: print each intermediate value, then the captured word."""
    record_match = re.match(regexp1, record)
    if record_match is None:
        raise ValueError('log record does not match regexp1')
    rlink = record_match.group('rlink')

    with open('regexps.conf', 'r') as conf:
        r = conf.read().strip()
    print(r)

    rule = parse_rule(r)
    print(rule)
    site, regexp2 = rule
    print('%s %s' % (site, regexp2))

    # Merge
    regexp3 = merge_regexp(site, regexp2)
    print(rlink)
    print(regexp3)
    mo = re.compile(regexp3).match(rlink)
    if mo is None:
        raise ValueError('referrer does not match the merged pattern')
    print(mo.groups())
    # Raises IndexError when the rule defines no group named 'word'.
    print(mo.group('word'))


if __name__ == '__main__':
    main()

sh$ python test.py
google.com, r'search.*[?&]q=(?P<word>.*)&.*'
('google.com', 'search.*[?&]q=(?P<word>.*)&.*')
google.com search.*[?&]q=(?P<word>.*)&.*
http://www.google.com/search?hl=en&q=%E6%94%AF%E4%BB%98%E5%AE%9D+HAS_NO_PRIVILEGE&btnG=Google+Search
http://[\w\.]*(?Pgoogle.com)/(?Psearch.*[?&]q=(?P.*)&.*)
('google.com', 'search?hl=en&q=%E6%94%AF%E4%BB%98%E5%AE%9D+HAS_NO_PRIVILEGE&btnG=Google+Search', '%E6%94%AF%E4%BB%98%E5%AE%9D+HAS_NO_PRIVILEGE')
%E6%94%AF%E4%BB%98%E5%AE%9D+HAS_NO_PRIVILEGE

如果改变 regexps.conf 的内容为：

google.com, r'search.*[?&]q=.*&.*'

即去除其中的分组设置，则运行为：

sh$ python test.py
google.com, r'search.*[?&]q=.*&.*'
('google.com', 'search.*[?&]q=.*&.*')
google.com search.*[?&]q=.*&.*
http://www.google.com/search?hl=en&q=%E6%94%AF%E4%BB%98%E5%AE%9D+HAS_NO_PRIVILEGE&btnG=Google+Search
http://[\w\.]*(?Pgoogle.com)/(?Psearch.*[?&]q=.*&.*)
('google.com', 'search?hl=en&q=%E6%94%AF%E4%BB%98%E5%AE%9D+HAS_NO_PRIVILEGE&btnG=Google+Search')
Traceback (most recent call last):
  File "test.py", line 26, in ?
    print mo.group('word')
IndexError: no such group

再把条件弄复杂一点，以保证总有 word 这个 group：

# Stricter rule pattern: accept only a regexp that contains the literal
# text "(?P<word>.*)" right after an "=", so the merged pattern always
# defines a group named 'word'.
rmo = re.match(r'\s*([\w\.]+)\s*,\s*r\'(.*=\(\?P<word>\.\*\).*)\'', r)

弹剑而歌

星期五, 五月 11, 2007

python merge regular expression from file

没有评论:

博客归档

供稿人