2012-07-09 48 views
1

MongoDB 新手問題:對「日誌」集合使用 MapReduce 生成 HTTP 流請求記錄

我有一個存放大量 HTTP 日誌的集合,其中文檔的數據結構如下:

{ 
    'client': { 
     'ip_address': '1.2.3.4', 
     'referrer':"http://....", 
     'user_agent':'Mozilla...' 
    }, 
    'request':{ 
     "stream": "stream1", 
     "method": "GET", 
     "fragment_id": 97, 
     "date": 13482181 
    }, 
    'response':{ 
     'status':200, 
     'size': 654 
    } 
} 

每個文檔描述一個 HTTP 請求(從客戶端到內容流)。由於每個流都被分割成更小的片段,我想對這個集合使用 MapReduce,爲每個客戶端生成一個「通用流請求」文檔,如下所示:

{ 
    'client_ip': '1.2.3.4', 
    'user_agent': 'Mozilla', 
    'streams':[ 
     { 
     'stream':"stream1", 
     'referrer':'http://...', 
     'requests':[ 
      { 
      'fragment_id':97, 
      'status':200, 
      'date': 13482181, 
      'size': 654 
      ... 
      }, 
      { 
      'fragment_id':98, 
      'status':200, 
      'date': 13482192, 
      'size': 624 
      ... 
      }, [...] 
     ] 
     }, [...] 
    ] 
} 

這裏是我的嘗試:

// Map step: for every log document, emit one flat request record keyed by
// the (client IP, user agent) pair.
map = function(){ 
    var client = this.client;
    var request = this.request;
    var response = this.response;

    // Every request coming from the same client/browser shares this key.
    var groupKey = {
        client_ip: client.ip,
        user_agent: client.user_agent
    };

    // One flat record describing this single HTTP request.
    var record = {
        stream: request.stream,
        referrer: client.referer,
        status: response.status,
        date: request.date,
        size: response.total_size,
        fragment_id: request.fragment_infos[1]
    };

    emit(groupKey, record);
};

// Reduce step: wrap all values received for one key into a single object
// holding how many values were seen and the values themselves.
//
// key    - the grouping key emitted by map (not used here).
// values - array of values to fold together.
// Returns {count: <number of values>, request: [<the values, in order>]}.
reduce = function(key,values){ 
    // `r` is assigned without `var`, so it lives on the global object.
    r = {'count':0,'request':[]}; 

    for (var i = 0; i < values.length; i++) { 
     r.request.push(values[i]); 
     r.count = r.count + 1; 
    } 

    // NOTE(review): the returned shape ({count, request}) differs from the
    // shape of the values emitted by map.
    return r; 
};

但這裏是我得到的結果:

"_id" : { 
    "client_ip" : "1.2.3.4", 
    "user_agent" : "Mozilla\/4.0" 
}, 
"value" : { 
    "client_ip" : "1.2.3.4", 
    "user_agent" : "Mozilla\/4.0", 
    "count" : 17, 
    "request" : { 
     "0" : { 
      "client_ip" : "1.2.3.4", 
      "user_agent" : "Mozilla\/4.0", 
      "count" : 2, 
      "request" : { 
       "0" : { 
        "stream" : "stream1.isml", 
        "referrer" : null, 
        "status" : 200, 
        "date" : 1341706566, 
        "size" : 456, 
        "fragment_id" : null, 
        "count" : 1 
       }, 
       "1" : { 
        "stream" : "stream1.isml", 
        "referrer" : null, 
        "status" : 200, 
        "date" : 1341706566, 
        "size" : null, 
        "fragment_id" : null, 
        "count" : 1 
       } 
      } 
     }, 
     "1" : { 
      "client_ip" : "1.2.3.4", 
      "user_agent" : "Mozilla\/4.0", 
      "count" : 3, 
      "request" : { 
       "0" : { 
        "client_ip" : "1.2.3.4", 
        "user_agent" : "Mozilla\/4.0", 
        "count" : 2, 
        "request" : { 
         "0" : { 
          "stream" : "stream1.isml", 
          "referrer" : null, 
          "status" : 200, 
          "date" : 1341706568, 
          "size" : null, 
          "fragment_id" : null, 
          "count" : 1 
......... 

我哪裏做錯了?

回答

1

你最終得到的記錄總是隻包含 _id 和 value 兩個字段,這是 MongoDB map/reduce 的固有特性。有一個尚未關閉的工單要求改變這種行爲: https://jira.mongodb.org/browse/SERVER-2517

就你的例子來說,你需要讓 map 函數輸出的值與你希望 reduce 函數輸出的值具有相同的形式,因爲 MongoDB 可能會把 reduce 的返回值再次作爲輸入傳給 reduce。

// Map step: emit one value per log document, already shaped like the desired
// per-client result (client_ip, user_agent, streams) so that reduce can merge
// values that all share the same form.
//
// Key:   {client_ip, user_agent}
// Value: {client_ip, user_agent, streams: {<stream name>: {referrer, requests: [...]}}}
//
// NOTE(review): the field names read here (client.ip, client.referer,
// response.total_size, request.fragment_infos[1]) are copied from the
// question's map function; the sample document in the question shows
// ip_address, referrer, size and fragment_id instead — confirm against the
// real collection.
var map = function () {
    var client = this.client;
    var request = this.request;
    var response = this.response;

    // An object literal cannot use an expression such as this.request.stream
    // as a property name, so the per-stream entry is assigned with bracket
    // notation.
    var streams = {};
    streams[request.stream] = {
        referrer: client.referer,
        requests: [
            {
                fragment_id: request.fragment_infos[1],
                status: response.status,
                date: request.date,
                size: response.total_size
            }
        ]
    };

    emit({client_ip: client.ip, user_agent: client.user_agent}, {
        client_ip: client.ip,
        user_agent: client.user_agent,
        streams: streams
    });
};

你還需要修改 reduce 函數,使其能夠合併多個這種形式的文檔。如有必要,再編寫一個 finalize 函數,把以流名稱爲鍵的 streams 哈希轉換成數組,並讓數組中的每個元素都包含自己的流名稱。