2012-04-10 89 views
2

最近我正在用 Python 分析 nginx 的訪問日誌。有沒有更高效的方法,可以用 Python 按空格拆分含有引號的字符串?

我根據 this(原文鏈接)找到了用 shlex 按空格拆分帶引號字符串的方法,但它真的很慢:分析 2000 行日誌要花超過 1.2 秒,而我的 nginx 服務器每秒會產生超過 2500 行日誌。

所以我嘗試了另外兩種方式:使用 re 正則表達式,或者用更原始(也更粗糙)的方式,直接按索引掃描字符串。

代碼是在虛擬機上運行的,以上兩種方式處理 2000 行日誌都需要大約 0.5 秒。

我是否有任何其他的選擇,使其更有效地運行?

在此先感謝

這裏是我的代碼

import re 
import time 
import datetime 
# Sample access-log line used as the input to convert() in the timing loop below.
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080 [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"' 
# Patterns are compiled once at import time: convert() runs once per log
# line, so rebuilding them inside the function is wasted work on a hot path.
_WHITESPACE_RE = re.compile(r"\s+")
_HTTP_METHOD_RE = re.compile(r"^((?:GET|POST|OPTIONS))\s+?.*HTTP\/1\.[10]$")
_STATUS_BYTES_RE = re.compile(r"^\s+(\d{1,3})\s+(\d+)")
_PORT_SUFFIX_RE = re.compile(r":\d+$")
_REQUEST_URI_RE = re.compile(r"^[A-Z]+\s+?(.*)HTTP\/1\.[10]$")
_SERVER_GUID_RE = re.compile(r"^([0-9\.]+)\s+(.*)")


def convert(line):
    """Parse one access-log line into a flat dict of fields.

    The line is first cut on double quotes; the unquoted prefix is then cut
    on runs of whitespace, and the remaining pieces are picked out by
    position.

    Args:
        line: One log line laid out like the sample above (whitespace
            separated prefix followed by six double-quoted sections).

    Returns:
        A dict with the keys ``hour``, ``year``, ``date`` (month * 100 + day),
        ``time`` (the minute), ``sec``, ``request_time`` and
        ``upstream_response_time`` (milliseconds, ``-1`` when the log holds
        ``-``), ``remote_addr``, ``upstream_addr`` (port removed), ``host``,
        ``method`` (``''`` when the request is not GET/POST/OPTIONS over
        HTTP/1.x), ``request_uri``, ``status``, ``bytes_sent``,
        ``http_referer``, ``user_agent``, ``gzip_ratio``,
        ``http_x_forwarded_for``, ``server_addr`` and ``guid``.

        ``request_uri`` is the raw ``findall`` result: a list that is empty
        when the request does not match, otherwise a single string that still
        carries the space preceding ``HTTP/1.x``.

    Raises:
        IndexError: If the line has fewer quoted sections or prefix tokens
            than expected, or the status/bytes or server/guid section does
            not match.
        ValueError: If a timing field is not numeric or the timestamp does
            not match ``%d/%b/%Y:%H:%M:%S``.
    """
    # Odd indexes hold the text inside quotes, even indexes the text between.
    quoted = line.split('"')
    head = _WHITESPACE_RE.split(quoted[0])

    method_hits = _HTTP_METHOD_RE.findall(quoted[1])
    http_method = method_hits[0] if method_hits else ''

    status_text, bytes_text = _STATUS_BYTES_RE.findall(quoted[2])[0]

    request_time = int(float(head[0]) * 1000)
    if head[1] == '-':
        # '-' in the log means no upstream timing was recorded.
        upstream_response_time = -1
    else:
        upstream_response_time = int(float(head[1]) * 1000)

    # head[6] (the "+0800]" offset) is ignored; the timestamp is parsed as a
    # naive datetime and "sec" is derived through the machine's local zone.
    dt = datetime.datetime.strptime(head[5].replace('[', ''),
                                    "%d/%b/%Y:%H:%M:%S")

    server_addr, guid = _SERVER_GUID_RE.findall(quoted[11])[0]

    return {
        "hour": dt.hour,
        "year": dt.year,
        "date": dt.month * 100 + dt.day,
        "time": dt.minute,
        "sec": time.mktime(dt.timetuple()),
        "request_time": request_time,
        "upstream_response_time": upstream_response_time,
        "remote_addr": head[2],
        "upstream_addr": _PORT_SUFFIX_RE.sub("", head[4]),
        "host": head[7],
        "method": http_method,
        "request_uri": _REQUEST_URI_RE.findall(quoted[1]),
        "status": int(status_text),
        "bytes_sent": int(bytes_text),
        "http_referer": quoted[3],
        "user_agent": quoted[5],
        "gzip_ratio": quoted[7],
        "http_x_forwarded_for": quoted[9],
        "server_addr": server_addr,
        "guid": guid,
    }
# Benchmark: parse the sample line 12000 times and print the wall-clock
# seconds taken by each batch of 2000 calls.
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        # Shift the previous checkpoint before taking the new one so the
        # printed value covers exactly the last 2000 calls.
        t1 = t2
        t2 = time.time()
        print(str(t2 - t1))

索引方式

import time 
import datetime 
# Sample access-log line used as the input to convert() in the timing loop below.
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080 [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"' 

def pair(l):
    """Yield consecutive, non-overlapping 2-tuples from the sequence *l*.

    Items 0 and 1 form the first tuple, items 2 and 3 the second, and so on.
    A sequence with an odd number of items raises IndexError once the
    unpaired last item is reached.
    """
    total = len(l)
    left = 0
    while left < total:
        yield l[left], l[left + 1]
        left += 2

def convert(line):
    """Split *line* on spaces, keeping double-quoted sections in one piece.

    A space is a separator only when it lies outside a pair of double
    quotes. Quote characters are kept in the returned fields, and two
    adjacent separators produce an empty field.

    The previous implementation deleted every space from the line before
    looking for separators (so it always returned an empty list) and its
    slicing skipped the field in front of the first separator; both are
    fixed here, and the line is now scanned once instead of once per
    space/quote-pair combination.

    Args:
        line: The text to split.

    Returns:
        The list of fields, or None when the line contains no double quotes
        or an odd number of them.
    """
    quote_count = line.count('"')
    if quote_count == 0 or quote_count % 2 != 0:
        return None

    fields = []
    field_start = 0
    inside_quotes = False
    for position, char in enumerate(line):
        if char == '"':
            inside_quotes = not inside_quotes
        elif char == " " and not inside_quotes:
            fields.append(line[field_start:position])
            field_start = position + 1
    # Whatever follows the last separator is the final field.
    fields.append(line[field_start:])
    return fields


# def allindices(string, sub, listindex=[], offset=0): 
def allindices(string, sub):
    """Return every index at which *sub* occurs in *string*, in ascending order.

    The search resumes one character after each hit, so overlapping
    occurrences are all reported. The list is empty when *sub* never occurs.
    """
    hits = []
    search_from = 0
    while True:
        found = string.find(sub, search_from)
        if found < 0:
            return hits
        hits.append(found)
        search_from = found + 1

# Benchmark: parse the sample line 12000 times and print the wall-clock
# seconds taken by each batch of 2000 calls.
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        # Shift the previous checkpoint before taking the new one so the
        # printed value covers exactly the last 2000 calls.
        t1 = t2
        t2 = time.time()
        print(str(t2 - t1))
+0

這段代碼太長了。我不打算坐在這裏弄清楚它到底要做什麼。你能描述一下確切的解析規則,和/或給出樣本輸入和相應的輸出嗎? – 2012-04-10 11:53:47

+0

也許你會這樣做有點不對?考慮更改服務器上的['log_format'](http://wiki.nginx.org/HttpLogModule)選項以適合您的解析器;也許讓它看起來像json,或者添加一個不規則的分隔符(比如'|')而不是空白。 – SingleNegationElimination 2012-04-10 12:28:51

回答

3

我剛根據樣本行寫了一個正則表達式。有些字段的含義我並不清楚,所以給它們用了佔位符名稱,你可以把它們改成更有意義的名字。在我的機器上,這個片段比第一個片段快4到5倍。

# One-shot pattern for a whole access-log line, written in re.VERBOSE mode
# (whitespace inside the pattern is ignored; separators are explicit \s).
# Group names such as float1/field1/number are placeholders chosen without
# knowing the field meanings; they are the keys of match.groupdict().
#
# float2 and ip_port_1 also accept a bare "-", which the log uses when no
# value was recorded. field4 is followed by an explicit \s so that it no
# longer swallows the space in front of the final quoted section.
log_line_re = re.compile(
r"""
(?P<float1>[0-9.]+)
\s
(?P<float2>[0-9.]+|-)
\s
(?P<ip1>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
\s
(?P<field1>.+?)
\s
(?P<ip_port_1>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}|-)
\s+
\[(?P<request_date>.+?)\]
\s
(?P<host>.+?)
\s
"
(?P<http_method>[A-Z]+)
\s
(?P<request_path>.+?)
\s
HTTP/(?P<http_version>[0-9.]+)
"
\s
(?P<status_code>\d{3})
\s
(?P<number>\d+)
\s
"
(?P<referer>.+?)
"
\s
"(?P<user_agent>.+?)"
\s
"(?P<field2>.+?)"
\s
"(?P<field3>.+?)"
\s
(?P<field4>.+?)
\s
"
(?P<ip2>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
\s
(?P<request_guid>.+?)
"
""", re.VERBOSE)


def convert(line):
    """Parse one log line with ``log_line_re``.

    Args:
        line: The log line to parse; matching is anchored at its start.

    Returns:
        A dict mapping each named group of ``log_line_re`` to the text it
        captured, or None when the line does not match the pattern (the
        earlier version raised AttributeError in that case).
    """
    match = log_line_re.match(line)
    if match is None:
        return None
    return match.groupdict()
+0

是的,我怎麼能忘記使用單一的答案:D – AleiPhoenix 2012-04-10 13:56:58

3

這看起來有點像CSV;我想知道csv模塊是否可以被濫用來處理這個問題?

>>> for row in csv.reader([line], delimiter=' '): 
...  print repr(row) 
... 
['0.278', '0.264', '113.116.52.174', '-', '10.10.3.41:20080', '', '[08/Apr/2012:23:59:08', '+0800]', 'shenzhen.anjuke.com', 'GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0', '200', '10914', 'http://shenzhen.anjuke.com/prop/view/104178677', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)', '-', '-', '-', '114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E']