2016-08-23 70 views
0

我正在嘗試用 Python 從 XML 標籤中提取一些信息。我的做法是爲每個 situation(情況)保存一個字典,以標籤的 id 爲鍵、以其所有子節點的數據爲值,但我不知道該如何從文本節點中提取數據。謝謝。

Python XML DOM:收集元素數據

我的代碼:

from xml.dom.minidom import * 
import requests 

# Download the DATEX II situation publication, store it on disk and parse it
# into a minidom document that the functions below walk.
print("GETTING XML...")
resp = requests.get('http://infocar.dgt.es/datex2/dgt/SituationPublication/all/content.xml', stream = True) #XML that I need
if resp.status_code != 200:
    # The original raised ``ApiError``, which is not defined anywhere in this
    # script, so a failed download surfaced as a NameError. Raise a builtin
    # exception and name the URL that was actually requested.
    raise RuntimeError('GET {} {}'.format(resp.url, resp.status_code))
print("XML RECIBIDO 200 OK")
#resp.raw.decode_content = True
print("GUARDANDO XML")
with open("DGT_DATEX.xml", "wb") as handle:
    # iter_content() defaults to 1-byte chunks; ask for larger chunks so the
    # file is not written one byte at a time.
    for data in resp.iter_content(chunk_size=8192):
        handle.write(data)

print("XML GUARDADO")
print("INICIANDO PARSEO..")
dom3 = parse("DGT_DATEX.xml")
print(dom3)#memory dir
print("DATEX PARSEADO")




def getText(nodelist):
    """Map the ``id`` attribute of each element in *nodelist* to its text.

    For every element node in *nodelist*, the text data of all text nodes
    below it is collected (in document order, via ``goDeep``) into a fresh
    list, which is stored under the element's ``id`` attribute. Elements
    without an ``id`` attribute share the key ``''`` (``getAttribute`` returns
    an empty string), so a later one overwrites an earlier one.

    The resulting dictionary is printed before it is returned.

    :param nodelist: iterable of DOM nodes, e.g. ``element.childNodes``.
    :return: dict mapping id strings to lists of text-node data.
    """
    texts_by_id = {}
    # The original looped over an undefined name (``nodelistofPayloadTag``)
    # instead of the ``nodelist`` parameter, which raised NameError.
    for node in nodelist:
        # Only elements carry attributes; skip text, comments, etc.
        if node.nodeType != node.ELEMENT_NODE:
            continue
        # A new list per element: the original reused one list for every id,
        # so all keys ended up pointing at the same, ever-growing list.
        texts_by_id[node.getAttribute('id')] = goDeep(node.childNodes, [])

    print(str.format("El diccionario antes de ser retornado es {0}", texts_by_id))
    return texts_by_id

def goDeep(childsOfElement, l):
    """Append the data of every text node under *childsOfElement* to *l*.

    The nodes are visited depth-first in document order. Text nodes
    contribute their ``data``; every other node is only descended into.

    :param childsOfElement: iterable of DOM nodes to walk.
    :param l: list that receives the text; it is modified in place.
    :return: the same list object *l*.
    """
    # Explicit stack instead of recursion; children are pushed reversed so
    # that popping yields them in their original order.
    pending = list(childsOfElement)[::-1]
    while pending:
        current = pending.pop()
        if current.nodeType == current.TEXT_NODE:
            l.append(current.data)
        else:
            pending.extend(list(current.childNodes)[::-1])
    return l

def getSituation(payloadTag):
    """Run ``getText`` over the child nodes of *payloadTag*.

    The original called ``getText`` but dropped its result, so this function
    always returned ``None``; the result is now passed back to the caller.

    :param payloadTag: DOM element whose ``childNodes`` are processed.
    :return: whatever ``getText`` returns for those child nodes.
    """
    return getText(payloadTag.childNodes)



def getPayLoad(dom):
    """Process the first ``payloadPublication`` element found in *dom*.

    The element is looked up by local name in any namespace, printed, and
    handed to ``getSituation``. The original discarded that call's result and
    returned ``None``; the result is now returned.

    :param dom: DOM document (or element) to search.
    :return: whatever ``getSituation`` returns for the payload element.
    :raises IndexError: if *dom* contains no ``payloadPublication`` element.
    """
    # Look the element up once instead of once per use.
    payload = dom.getElementsByTagNameNS('*', 'payloadPublication')[0]
    print(str.format("Tag to be processed:{0}", payload))
    return getSituation(payload)


# Call getPayLoad once and reuse the result: calling it twice walked the
# document twice and repeated all of its debug output.
payload = getPayLoad(dom3)
print(str.format("Verificando que el dato retornado es un diccionario, {0}, y contiene {1}", type(payload), payload))
+0

一定要用 lxml.etree 來嘗試嗎?比如 .xpath("//*[name()='_0:situation']")? –

回答

0

下面是一種可以讓我收集子節點數據的方法,謝謝。

import xml.etree.ElementTree as ET 

from xml.dom.minidom import * 

import requests 

# Download the DATEX II situation publication, store it on disk and parse it
# into a minidom document that getPayLoad walks.
print("GETTING XML...")
resp = requests.get('http://infocar.dgt.es/datex2/dgt/SituationPublication/all/content.xml', stream = True) #XML that I need
if resp.status_code != 200:
    # The original raised ``ApiError``, which is not defined anywhere in this
    # script, so a failed download surfaced as a NameError. Raise a builtin
    # exception and name the URL that was actually requested.
    raise RuntimeError('GET {} {}'.format(resp.url, resp.status_code))
print("XML RECIBIDO 200 OK")
#resp.raw.decode_content = True
print("GUARDANDO XML")
with open("DGT_DATEX.xml", "wb") as handle:
    # iter_content() defaults to 1-byte chunks; ask for larger chunks so the
    # file is not written one byte at a time.
    for data in resp.iter_content(chunk_size=8192):
        handle.write(data)

print("XML GUARDADO")
print("INICIANDO PARSEO..")
dom3 = parse("DGT_DATEX.xml")
print(dom3)#memory dir
print("DATEX PARSEADO")

def getAttributeID(element):
    """Return the value of the ``id`` attribute of *element*.

    :param element: object exposing ``getAttribute`` (e.g. a DOM element).
    :return: the result of ``element.getAttribute('id')``.
    """
    identifier = element.getAttribute('id')
    return identifier

def getText(element):
    """Return the ``data`` of a text node, or ``''`` when there is no node.

    Callers in this script pass ``someElement.firstChild``, which is ``None``
    for an empty element; that case used to raise AttributeError and now
    yields an empty string.

    :param element: a node exposing ``data`` (e.g. a DOM text node) or None.
    :return: ``element.data``, or ``''`` if *element* is None.
    """
    if element is None:
        return ''
    return element.data

# Tags printed once per situation, in output order.
_SITUATION_FIELDS = (
    'confidentiality',
    'informationStatus',
)

# Tags printed once per situation record, in output order.
_RECORD_FIELDS = (
    'situationRecordCreationReference',
    'situationRecordCreationTime',
    'situationRecordVersion',
    'situationRecordVersionTime',
    'situationRecordFirstSupplierVersionTime',
    'probabilityOfOccurrence',
    'sourceCountry',
    'sourceIdentification',
    'validityStatus',
    'overallStartTime',
    'overallEndTime',
    'impactOnTraffic',
    'locationDescriptor',
    'tpegDirection',
    'latitude',
    'longitude',
)


def _first_text(node, tag):
    """Return the leading text of the first *tag* element below *node*, or ''."""
    matches = node.getElementsByTagNameNS('*', tag)
    if not matches:
        return ''
    # firstChild is None for an empty element and may be a non-text node;
    # both cases fall back to the empty string.
    return getattr(matches[0].firstChild, 'data', '')


def getPayLoad(dom):
    """Print the id and selected fields of every situation and its records.

    For each ``situation`` element (any namespace) this prints a header line
    with its ``id`` and 1-based position, then its confidentiality and
    information status. Each child element of the situation that carries an
    ``id`` attribute is treated as a situation record: a header line with its
    id and a running 1-based record number is printed, followed by one line
    per record field and a final ``VALUE FIELD`` line for ``descriptor``.

    A field that is absent is printed as an empty line so the positional
    layout of the output stays the same for every record.

    Changes from the original implementation:

    * The 1-based display counters were also used as 0-based list indices,
      so each situation showed the data of the following one and the last
      iteration raised IndexError. Values are now read from the situation or
      record currently being visited.
    * Fields were looked up in document-wide lists, which misaligns as soon
      as a tag is missing or repeated, and re-scanned the whole document for
      every field. Lookups are now scoped to the current element.
    * The original comment stated the intent of selecting the child "that has
      an ID"; that check is now actually performed.

    :param dom: DOM document (or element) containing ``situation`` elements.
    :return: None; all output goes to stdout.
    """
    record_number = 0  # running count across all situations, shown 1-based
    situations = dom.getElementsByTagNameNS('*', 'situation')
    for situation_number, situation in enumerate(situations, 1):
        print(str.format("Situation ID: {0} numero {1}", situation.getAttribute('id'), situation_number))
        for tag in _SITUATION_FIELDS:
            print(_first_text(situation, tag))

        for child in situation.childNodes:
            # Skip whitespace text, comments and id-less children.
            if child.nodeType != child.ELEMENT_NODE or not child.hasAttribute('id'):
                continue
            record_number += 1
            print(str.format("SituationRecord ID: {0} numero {1}", child.getAttribute('id'), record_number))
            for tag in _RECORD_FIELDS:
                print(_first_text(child, tag))
            print(str.format("VALUE FIELD: {0}", _first_text(child, 'descriptor')))

# Script entry point: print the situation data of the document parsed above.
getPayLoad(dom3)
1

我寫出了下面這段代碼,這是你想要的嗎?

def getText(element):
    """Return the node's ``data`` encoded as UTF-8 and stripped of whitespace.

    The caller in this snippet passes ``someElement.firstChild``, which is
    ``None`` for an empty element; that used to raise AttributeError. It now
    yields an empty byte string, which the caller's ``if value:`` check
    already treats as "nothing to print".

    :param element: a node exposing ``data`` (e.g. a DOM text node) or None.
    :return: the UTF-8 encoded, stripped data; ``b''`` if *element* is None.
    """
    if element is None:
        return b''
    return element.data.encode('utf-8').strip()


def getPayLoad(dom):
    """Print the id and the available fields of every ``situation`` element.

    Each situation (matched by local name in any namespace) gets a header
    line with its id and 1-based position. For every tag name in the field
    list, the first matching descendant is looked up; a ``name: value`` line
    is printed only when the element exists and ``getText`` returns a
    non-empty value for its first child.

    :param dom: DOM document (or element) containing ``situation`` elements.
    :return: None; all output goes to stdout.
    """
    attrs = [
        'confidentiality',
        'informationStatus',
        'situationRecordCreationReference',
        'situationRecordCreationTime',
        'situationRecordVersion',
        'situationRecordVersionTime',
        'situationRecordFirstSupplierVersionTime',
        'probabilityOfOccurrence',
        'sourceCountry',
        'sourceIdentification',
        'validityStatus',
        'overallStartTime',
        'overallEndTime',
        'impactOnTraffic',
        'locationDescriptor',
        'tpegDirection',
        'latitude',
        'longitude',
        'tpegDescriptorType',
        'from',
    ]

    position = 0
    for situation in dom.getElementsByTagNameNS('*', 'situation'):
        position += 1
        header = "\nSituation ID: {0} numero {1}".format(getAttributeID(situation), position)
        print(header)
        for field in attrs:
            matches = situation.getElementsByTagNameNS('*', field)
            if not matches:
                continue
            text = getText(matches[0].firstChild)
            if not text:
                continue
            print('{0}: {1}'.format(field, text))
+0

感謝您巧妙的代碼,它給了我很好的思路 – Datex2