2013-05-10 87 views
0

試圖通過使用遞歸的文本定義邊界標記來分割文本,並創建包含原始文本文件的所有組織部分的列表和字符串列表。遞歸文本分割的問題

拆分沒有發生。

這裏是短版:真正的問題腳本:

def separate(text,boundary = None): 
    if boundary == None: 
     m = re.findall(r'(?<=boundary=).*',text) 
     i = 0 
     while i < len(m): #have all levels of Boundary/headers named 
      boundary = m[i] 
      textList = recursiveSplit(text,boundary) 
      i += 1 
     pdb.set_trace() 
     return textList 

def recursiveSplit(chunk,boundary): 
    if type(chunk) is types.StringType: 
     ar = re.split(r'(?P<boundary>)(?!--)',chunk) 
     return ar 
    if type(chunk) is types.ListType: 
     i = 0 
     while i < len(chunk): 
      chunk[i] = recursiveSplit(chunk[i],boundary) 
      i += 1 
     return obj 

我之前已經發布了這個劇本,人們希望我將它張貼在其全部,所以我會做

#Textbasics email parser 
#based on a "show original" file converted into text 

from sys import argv 
import re, os, pdb, types 

script, filename = argv 
text = open(filename).read() 
type = "text only" #Set the default type of email 

#cut the email up by sections 
#--A section is defined as any time there are two line breaks in a row 
textList = re.split(r"\n\n", text) 
header = textList[0] 
if re.search(r'MIME-Version',header): 
    type = "MIME" 

# If mail has no attachments, parse as a text-only email 
class Parser(object): 

    def __init__(self,textList): 
     a = 1 
     self.body = "" 
     self.textList = textList 
     self.header = textList[0] 
     while a < len(textList): 
      self.body = self.body + textList[a] + '\n\n' 
      a += 1 

     m = re.search(r'(?<=Subject:).*', self.header) 
     self.subject = m.group(0) 

     m = re.search(r'(?<=From:).*', self.header) 
     self.fromVar = m.group(0) 

     m = re.search(r'(?<=To:).*', self.header) 
     self.toVar = m.group(0) 

     m = re.search(r'(?<=Date:)\w+\s\w+\s\w+', self.header) 
     self.date = m.group(0) 

    def returnParsed(self,descriptor = "all"): 
     if descriptor == "all": 
      retv = "Subject: " + self.subject + "\n" + "From: " + self.fromVar + "\n" + "To: " + self.toVar + "\n" + "Date: " + self.date + "\n" + "\n" + self.body 
      return retv 

     if descriptor == "subject": 
      return self.subject 
     if descriptor == "fromVar": 
      return self.fromVar 
     if descriptor == "toVar": 
      return self.toVar 
     if descriptor == "date": 
      return self.date 
     if descriptor == "body": 
      return self.body 

class MIMEParser(Parser): 

    class MIMEDataDecoder(object): 
     def __init__(self,decodeString,type): 
      pass  


    def __init__(self,textList): 
     self.textList = textList 
     self.nestedItems = [] 
     newItem = NestedItem(self) 
     newItem.setContentType("Header") 
     newItem.setValue(self.textList[0]) 
     self.nestedItems.append(newItem) 
     if re.search(r'(boundary=)',newItem.value): 
      helperItem = NestedItem(self) 
      helperItem.value = (self.textList[0]) 
      m = re.search(r'(?<=Content-Type:).+(?=;)',newItem.value) 
      helperItem.setContentType(m.group(0)) 
      self.nestedItems.append(helperItem) 

     self.organizeData() 
     """i = 0 
     while i < len(self.textList): 
      newItem = NestedItem(self) 
      ct = self.nextContentType 
      newItem.setContentType(ct) 
      newItem.setValue(self.textList[i]) 
      self.nestedItems.append(newItem) 
      m = re.search(r'(?<=Content-Type:).+(?=;)',self.textList[i]) 
      if m: 
       self.nextContentType = m.group(0) 
      i += 1 
      """ 

    def nestItem (self,item): 
     self.nestedItems.append(item) 

    def organizeData(self): 
     self.nestLevel = 1 
     self.currentSuper = self 
     m = re.search(r'(?<=boundary=).*',self.textList[0]) 
     self.currentBoundary = m.group(0) 
     self.currentList = self.textList 
     self.currentList.remove(self.textList[0]) 
     self.formerObjectDatabase = {} 
     pdb.set_trace() 
     while self.nestLevel > 0: 
      i = 0 
      while i < len(self.currentList): 

       boundary = self.currentBoundary 
       #If block is a "normal block", containing a current boundary identifier 
       p = re.search(r'--(?P<boundary>)(?!--)', text) 
       if p: 
        newItem = NestedItem(self.currentSuper) 
        newItem.setValue(self.currentList[i]) 
        r = re.search(r'(?<=Content-Type:).+(?=;)',newItem.value) 
        if r: 
         newItem.setContentType(r.group(0)) 
        self.currentObject = newItem 
        self.currentSuper.nestItem(self.currentObject) 
       #If the block contains a new block boundary 
       m = re.search(r'(?<=boundary=).*',self.currentList[i]) 
       if m: 
        #begin new layer of recursive commands 
        newFormerObject = self.FormerCurrentObject(self.currentList,self.currentSuper,self.currentBoundary) 
        self.formerObjectDatabase[self.nestLevel] = newFormerObject 
        self.currentSuper = self.currentObject 
        self.nestLevel += 1 
        self.currentBoundary = m.group(0) 
        boundary = self.currentBoundary 
        #self.currentList = re.split(r'--(?P<boundary>)(?!--)', self.currentList[i]) 
       boundary = self.currentBoundary 
       #If block contains an "end of boundary" marker 
       q = re.search(r'(?P<boundary>)--', text) 
       if q: 
        self.nestLevel -= 1 
        currentObject = self.formerObjectDatabase[self.nestLevel] 
        self.currentList = currentObject.formerList 
        self.currentSuper = currentObject.formerSuper 
        self.currentBoundary = currentObject.formerBoundary 
       i += 1      


    class FormerCurrentObject: 
     def __init__(self,formerList,formerSuper,formerBoundary): 
      self.formerList = formerList 
      self.formerSuper = formerSuper 
      self.formerBoundary = formerBoundary 




    def printAll(self): 
     print "printing all: %d" % len(self.nestedItems) 
     i = 0 
     while i < len(self.nestedItems): 
      print "printing out item %d" % i 
      self.nestedItems[i].printOut() 
      i += 1 

class NestedItem(object): 
    def __init__(self,superObject,contentType=" ",value = " "): 
     self.superObject = superObject 
     self.contentType = contentType 
     self.value = value 
     self.nestedItems = [] 

    def nestItem(self,item): 
     self.nestedItems.append(item) 

    def printOut(self,printBuffer = ""): 
     print printBuffer + '++%s' % self.contentType 
     print printBuffer + self.value 
     a = 0 
     printBuffer = printBuffer + " " 
     while a < len(self.nestedItems): 
      self.nestedItems[a].printOut(printBuffer) 

    def setContentType(self,contentType): 
     self.contentType = contentType 

    def setValue(self,value): 
     self.value = value 



if type == "text only": 
    p = Parser(textList) 
    print p.returnParsed() 
# ---PROBLEM CODE STARTS HERE--- 
def separate(text,boundary = None): 
    pdb.set_trace() 
    if boundary == None: 
     m = re.findall(r'(?<=boundary=).*',text) 
     i = 0 
     textList = [text] 
     while i < len(m): #have all levels of Boundary/headers named 
      boundary = m[i] 
      textList = recursiveSplit(textList,boundary) 
      i += 1 

    return textList 

def recursiveSplit(chunk,boundary): 
    if type(chunk) is types.ListType: #<<--error occurs here 
     for obj in chunk: 
      recursiveSplit(obj,boundary) 
    if type(chunk) is types.StringType: 
     list = re.split(r'(?P<boundary>)(?!--)',chunk) 
     return list 
    return None 
#---PROBLEM CODE ENDS(?) HERE--- 

if type == "MIME": 
    #separate the text file instead by its boundary identifier 
    p = MIMEParser(separate(text)) 
    p.printAll() 

您可以使用任何MIME類型的電子郵件來運行。這是我一直在使用的方便

MIME-Version: 1.0 
Received: by 10.112.170.40 with HTTP; Fri, 3 May 2013 05:08:21 -0700 (PDT) 
Date: Fri, 3 May 2013 08:08:21 -0400 
Delivered-To: [email protected] 
Message-ID: <@mail.gmail.com> 
Subject: MiB 5/3/13 7:43AM (EST) 
From: ME<[email protected]> 
To: SOMEONE <[email protected]> 
Content-Type: multipart/mixed; boundary=BNDRY1 

--BNDRY1 
Content-Type: multipart/alternative; boundary=BNDRY2 

--BNDRY2 
Content-Type: text/plain; charset=ISO-8859-1 

-changed signature methods to conform more to working clinic header 
methods(please test/not testable in simulator) 
-confirmed that signature image is showing up in simulator. Awaiting 
further tests 
-Modified findings spacing/buffer. See if you like it 

--BNDRY2 
Content-Type: text/html; charset=ISO-8859-1 

<div dir="ltr">-changed signature methods to conform more to working clinic header methods(please test/not testable in simulator)<div style>-confirmed that signature image is showing up in simulator. Awaiting further tests</div> 
<div style>-Modified findings spacing/buffer. See if you like it</div></div> 

--BNDRY2-- 
--BNDRY1 
Content-Type: application/zip; name="Make it Brief.ipa.zip" 
Content-Disposition: attachment; filename="Make it Brief.ipa.zip" 
Content-Transfer-Encoding: base64 
X-Attachment-Id: f_hg9biuno0 

<<FILE DATA>> 
--BNDRY1-- 
+0

你希望得到的最終輸出是什麼? – 2013-05-10 22:02:35

+0

我希望能得到由BOUND標記分隔的列表和字符串列表。所以每個列表元素將以BNDRY(X)開始,然後僅包含該BNDRY標題下的信息 – Pinwheeler 2013-05-10 22:07:49

+0

我發現該正則表達式不正確。當我硬編碼BNDRY1它按預期工作(對於BNDRY1)。 – Pinwheeler 2013-05-10 22:37:36

回答

2

的一個問題是在正則表達式。可能有更酷的方式來做到這一點,但我只是基於變量創建了一個搜索字符串文字。

def recursiveSplit(chunk,boundary): 
    if type(chunk) is types.StringType: 
     #ar = re.split(r'(?P<boundary>)(?!--)',chunk) 
     searchString = "--%s" % boundary 
     print searchString 
     ar = re.split(searchString,chunk) 
     return ar 
    if type(chunk) is types.ListType: 
     i = 0 
     while i < len(chunk): 
      chunk[i] = recursiveSplit(chunk[i],boundary) 
      i += 1 
     return obj 
+0

+1爲自給自足。 – jpaugh 2013-05-10 22:52:52