2017-03-16 54 views
0

我有一個 XML 文件,想在 R 中解析它。如何在 R 中解析子節點數目不同、且有多個節點同名的 XML?

<?xml version="1.0" encoding="UTF-8"?><CONSOLIDATED_LIST xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://www.un.org/sc/resources/sc-sanctions.xsd" dateGenerated="2016-12-21T19:09:23.456-05:00"> 
     <INDIVIDUALS> 
      <INDIVIDUAL> 
       <DATAID>6908434</DATAID> 
       <VERSIONNUM>1</VERSIONNUM> 
       <FIRST_NAME>ABD</FIRST_NAME> 
       <SECOND_NAME>AL-KHALIQ</SECOND_NAME> 
       <THIRD_NAME> AL-HOUTHI </THIRD_NAME> 
       <UN_LIST_TYPE>Yemen</UN_LIST_TYPE> 
       <REFERENCE_NUMBER>YEi.001</REFERENCE_NUMBER> 
       <LISTED_ON>2014-11-07</LISTED_ON> 
       <NAME_ORIGINAL_SCRIPT>عبدالخالق الحوثي</NAME_ORIGINAL_SCRIPT> 
       <COMMENTS1>Gender [Male].</COMMENTS1> 
       <DESIGNATION> 
        <VALUE>Huthi military commander</VALUE> 
       </DESIGNATION> 
       <NATIONALITY> 
        <VALUE>Yemen</VALUE> 
       </NATIONALITY> 
       <LIST_TYPE> 
        <VALUE>UN List</VALUE> 
       </LIST_TYPE> 
       <LAST_DAY_UPDATED> 
        <VALUE>2014-11-20</VALUE> 
        <VALUE>2016-08-26</VALUE> 
       </LAST_DAY_UPDATED> 
       <INDIVIDUAL_ALIAS> 
        <QUALITY>Good</QUALITY> 
        <ALIAS_NAME>Abd-al-Khaliq al-Huthi</ALIAS_NAME> 
       </INDIVIDUAL_ALIAS> 
       <INDIVIDUAL_ALIAS> 
        <QUALITY>Good</QUALITY> 
        <ALIAS_NAME>Abd-al-Khaliq Badr-al-Din al Huthi</ALIAS_NAME> 
       </INDIVIDUAL_ALIAS> 
       <INDIVIDUAL_ALIAS> 
        <QUALITY>Good</QUALITY> 
        <ALIAS_NAME>‘Abd al-Khaliq Badr al-Din al-Huthi</ALIAS_NAME> 
       </INDIVIDUAL_ALIAS> 
       <INDIVIDUAL_ALIAS> 
        <QUALITY>Good</QUALITY> 
        <ALIAS_NAME>Abd al-Khaliq al-Huthi </ALIAS_NAME> 
       </INDIVIDUAL_ALIAS> 
       <INDIVIDUAL_ALIAS> 
        <QUALITY>Low</QUALITY> 
        <ALIAS_NAME>Abu-Yunus</ALIAS_NAME> 
       </INDIVIDUAL_ALIAS> 
       <INDIVIDUAL_ADDRESS> 
        <COUNTRY/> 
       </INDIVIDUAL_ADDRESS> 
       <INDIVIDUAL_DATE_OF_BIRTH> 
        <TYPE_OF_DATE>EXACT</TYPE_OF_DATE> 
        <YEAR>1984</YEAR> 
       </INDIVIDUAL_DATE_OF_BIRTH> 
       <INDIVIDUAL_PLACE_OF_BIRTH/> 
       <INDIVIDUAL_DOCUMENT/> 
       <SORT_KEY/> 
       <SORT_KEY_LAST_MOD/> 
      </INDIVIDUAL> 
     </INDIVIDUALS> 
    </CONSOLIDATED_LIST> 

解析上面的 XML 文件後,所需的輸出如下:

--------------------------------------------------------------------------- 
    DATAID | FIRST_NAME | SECOND_NAME | THIRD_NAME | FOURTH_NAME | ALIAS_NAME | QUALITY 
    --------------------------------------------------------------------------- 
6908434 | ABD | AL-KHALIQ | AL-HOUTHI | NA | Abd-al-Khaliq al-Huthi | Good 
----------------------------------------------------------------------------- 
6908434 | ABD | AL-KHALIQ | AL-HOUTHI | NA | Abd-al-Khaliq Badr-al-Din al Huthi | Good 
----------------------------------------------------------------------------- 
6908434 | ABD | AL-KHALIQ | AL-HOUTHI | NA | ‘Abd al-Khaliq Badr al-Din al-Huthi | Good 
----------------------------------------------------------------------------- 
6908434 | ABD | AL-KHALIQ | AL-HOUTHI | NA | Abd al-Khaliq al-Huthi | Good 
----------------------------------------------------------------------------- 
6908434 | ABD | AL-KHALIQ | AL-HOUTHI | NA | Abu-Yunus | Low 
----------------------------------------------------------------------------- 

其中一個問題是,某些條目沒有 THIRD_NAME 和 FOURTH_NAME。如能提供任何幫助,不勝感激,謝謝。

我已經嘗試過下面的代碼:

# Asker's attempt: build one row per element of `individuals` and stack the
# rows into a matrix, then convert it to a data frame.
# NOTE(review): `individuals` is not defined in this snippet; presumably it is
# a list of <INDIVIDUAL> nodes obtained earlier -- confirm.
result <- do.call(rbind,lapply(individuals,function(individual){ 
    # `individual["NAME"]` selects the children called NAME; `[[1]]` takes the
    # first one, so a missing child makes the lookup fail (the question text
    # reports failures for entries without THIRD_NAME / FOURTH_NAME).
    DATAID  <- xmlValue(individual["DATAID"][[1]]) 
    FIRST_NAME <- xmlValue(individual["FIRST_NAME"][[1]]) 
    SECOND_NAME <- xmlValue(individual["SECOND_NAME"][[1]]) 
    THIRD_NAME <- xmlValue(individual["THIRD_NAME"][[1]]) 
    FOURTH_NAME <- xmlValue(individual["FOURTH_NAME"][[1]]) 
    # Only DATAID and FIRST_NAME are returned; the other values computed above
    # are not used, and the alias nodes are not read at all.
    c(DATAID = DATAID, FIRST_NAME = FIRST_NAME) 
})) 
result <- data.frame(result) 

但當條目沒有 THIRD_NAME 或 FOURTH_NAME 時,這段代碼就會失敗;另外我也不確定如何取得各個 ALIAS_NAME。

回答

0

看來下面的代碼可以正常工作:

# xmlParse(), getNodeSet() and xmlValue() used below come from the XML package.
library(XML)

# Parse the consolidated list once; the helper functions below read this
# document from the global `un_xml`.
un_xml <- xmlParse("~/Downloads/consolidated.xml")

# Number of <INDIVIDUAL> elements anywhere in the document.
entries <- length(getNodeSet(un_xml, "//INDIVIDUAL"))

# Text of the `Node` children of the i-th `i_or_e` element.
#
# Arguments:
#   i_or_e  Name of the record element (the callers pass "INDIVIDUAL" or
#           "ENTITY").
#   Node    Child element name, or a relative XPath below the record.
#   i       Position used in the XPath predicate `[i]`.
#
# Returns a character vector with one value per matching node, or "" when
# nothing matches. Reads the parsed document from the global `un_xml`.
getNameValue <- function(i_or_e, Node, i) {
    # Evaluate the XPath once and reuse the node set for both the emptiness
    # check and the value extraction.
    nodes <- getNodeSet(un_xml, paste0("//", i_or_e, "[", i, "]/", Node))
    if (length(nodes) == 0) {
        return("")
    }
    # vapply() guarantees a character vector regardless of the node count.
    vapply(nodes, xmlValue, character(1))
}

# Text of the `Node` grandchildren of the i-th `i_or_e` element, i.e. nodes
# that sit one level below any child of the record (the callers use it for
# ALIAS_NAME and QUALITY).
#
# Arguments:
#   i_or_e  Name of the record element (the callers pass "INDIVIDUAL" or
#           "ENTITY").
#   Node    Name of the grandchild element to read.
#   i       Position used in the XPath predicate `[i]`.
#
# Returns a character vector with one value per matching node, or "" when
# nothing matches. Reads the parsed document from the global `un_xml`.
getAliasValue <- function(i_or_e, Node, i) {
    # Evaluate the XPath once and reuse the node set for both the emptiness
    # check and the value extraction.
    nodes <- getNodeSet(un_xml, paste0("//", i_or_e, "[", i, "]/*/", Node))
    if (length(nodes) == 0) {
        return("")
    }
    # vapply() guarantees a character vector regardless of the node count.
    vapply(nodes, xmlValue, character(1))
}

# Text of the UN_LIST_TYPE child of the i-th `i_or_e` element, or "" when the
# record has none.
#
# Arguments:
#   i_or_e  Name of the record element (the callers pass "INDIVIDUAL" or
#           "ENTITY").
#   i       Position used in the XPath predicate `[i]`.
#
# Same lookup as getNameValue() with a fixed node name, so it delegates to
# that helper instead of repeating the XPath logic.
getUNListType <- function(i_or_e, i) {
    getNameValue(i_or_e, "UN_LIST_TYPE", i)
}

# Text of every LIST_TYPE/VALUE node of the i-th `i_or_e` element, or "" when
# the record has none.
#
# Arguments:
#   i_or_e  Name of the record element (the callers pass "INDIVIDUAL" or
#           "ENTITY").
#   i       Position used in the XPath predicate `[i]`.
#
# Same lookup as getNameValue() with a fixed relative path, so it delegates to
# that helper instead of repeating the XPath logic.
getListType <- function(i_or_e, i) {
    getNameValue(i_or_e, "LIST_TYPE/VALUE", i)
}


# Build one data frame per <INDIVIDUAL> record and stack them.
#
# seq_len() yields an empty sequence when `entries` is 0; seq(0) would iterate
# over c(1, 0) and query records that do not exist.
# NOTE(review): "//INDIVIDUAL[i]" picks the i-th INDIVIDUAL among its siblings,
# which lines up with the count in `entries` only if all records share one
# parent element -- confirm against the real file.
individuals <- lapply(seq_len(entries), function(i) {
    data.frame(
     ID   = vapply(
         getNodeSet(un_xml, paste0("//INDIVIDUAL[", i, "]/DATAID")),
         xmlValue, character(1)
     ),
     # Read through the helper, as the ENTITY block does, so a record without
     # FIRST_NAME yields "" instead of breaking data.frame().
     FIRST_NAME = getNameValue("INDIVIDUAL", "FIRST_NAME", i),
     SECOND_NAME = getNameValue("INDIVIDUAL", "SECOND_NAME", i),
     THIRD_NAME = getNameValue("INDIVIDUAL", "THIRD_NAME", i),
     FOURTH_NAME = getNameValue("INDIVIDUAL", "FOURTH_NAME", i),
     # One value per alias; data.frame() recycles the single-valued columns
     # above so that each alias becomes its own row.
     ALIAS_NAME = getAliasValue("INDIVIDUAL", "ALIAS_NAME", i),
     QUALITY  = getAliasValue("INDIVIDUAL", "QUALITY", i),
     UN_LIST_TYPE = getUNListType("INDIVIDUAL", i),
     LIST_TYPE = getListType("INDIVIDUAL", i)
    )
})

# Stack the per-record frames; the result is NULL when there are no records.
individuals_df <- do.call(rbind, individuals)

# Same extraction for the <ENTITY> records. `entries` is reused and now holds
# the number of ENTITY elements.
entries <- length(getNodeSet(un_xml, "//ENTITY"))

# seq_len() yields an empty sequence when `entries` is 0; seq(0) would iterate
# over c(1, 0) and query records that do not exist.
entities <- lapply(seq_len(entries), function(i) {
    data.frame(
     ID   = vapply(
         getNodeSet(un_xml, paste0("//ENTITY[", i, "]/DATAID")),
         xmlValue, character(1)
     ),
     FIRST_NAME = getNameValue("ENTITY", "FIRST_NAME", i),
     SECOND_NAME = getNameValue("ENTITY", "SECOND_NAME", i),
     THIRD_NAME = getNameValue("ENTITY", "THIRD_NAME", i),
     FOURTH_NAME = getNameValue("ENTITY", "FOURTH_NAME", i),
     # One value per alias; data.frame() recycles the single-valued columns
     # above so that each alias becomes its own row.
     ALIAS_NAME = getAliasValue("ENTITY", "ALIAS_NAME", i),
     QUALITY  = getAliasValue("ENTITY", "QUALITY", i),
     UN_LIST_TYPE = getUNListType("ENTITY", i),
     LIST_TYPE = getListType("ENTITY", i)
    )
})

# Stack the per-record frames; the result is NULL when there are no records.
entities_df <- do.call(rbind, entities)
0

可以考慮用 XSLT 轉換您的原始 XML。XSLT 是一種專用語言,用於把 XML 文件轉換成其他 XML、HTML,甚至文本文件(例如 CSV 或制表符分隔文件)。具體來說,爲了得到您需要的長格式,先定位每個 <INDIVIDUAL_ALIAS>,再向上到其祖先節點 <INDIVIDUAL>,取出上面列出的各個字段。轉換完成後,就可以用 XML 包的 xpathSApply() 或 xmlToDataFrame() 輕鬆導入。使用 XSLT 的另一個好處是:對於可選的 <THIRD_NAME>、<FOURTH_NAME> 這類元素,無論原始數據中是否存在,都可以在輸出中創建對應的(可能爲空的)節點。

唯一的難點是 R 沒有跨操作系統通用的 XSLT 轉換包。不過,R 可以通過 system() 在命令行調用 XSLT 處理器。大多數操作系統都自帶 XSLT 處理器,大多數通用語言(C#、Java、Python、Perl、VB)也有維護良好的 XSLT 庫。在大多數 Linux/Mac 發行版上可以使用 xsltproc;在 Windows 上可以通過 COM 調用 MSXML。當然,您也可以使用任何其他第三方處理器,如 Saxon、Xalan 等。

XSLT(保存爲 .xsl 文件——它本身就是一個格式完整的 XML 文件——以便在命令行調用)

<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"> 
<!-- Flattens the consolidated list into one <INDIVIDUAL> element per alias,
     so that every output record has the same set of child elements. -->
<xsl:output version="1.0" encoding="UTF-8" indent="yes" method="xml"/> 
<!-- Drop whitespace-only text nodes from every source element. -->
<xsl:strip-space elements="*"/> 

    <!-- Root: copy the <CONSOLIDATED_LIST> element itself (xsl:copy does not
         copy its attributes or children) and process only <INDIVIDUALS>. -->
    <xsl:template match="/CONSOLIDATED_LIST"> 
    <xsl:copy> 
     <xsl:apply-templates select="INDIVIDUALS"/> 
    </xsl:copy> 
    </xsl:template> 

    <!-- The <INDIVIDUALS> wrapper is not written to the output; only its
         <INDIVIDUAL> children are processed. -->
    <xsl:template match="INDIVIDUALS">  
    <xsl:apply-templates select="INDIVIDUAL"/>  
    </xsl:template> 

    <!-- Each source <INDIVIDUAL> is replaced by whatever its aliases produce.
         An <INDIVIDUAL> without any <INDIVIDUAL_ALIAS> child therefore
         produces no output. -->
    <xsl:template match="INDIVIDUAL">  
    <xsl:apply-templates select="INDIVIDUAL_ALIAS"/>  
    </xsl:template> 

    <!-- One output <INDIVIDUAL> per alias. DATAID, FIRST_NAME and SECOND_NAME
         are copied from the enclosing source <INDIVIDUAL> (nothing is written
         for one that is absent there). THIRD_NAME, FOURTH_NAME, ALIAS and
         QUALITY are always created and are empty when the source has no
         value. -->
    <xsl:template match="INDIVIDUAL_ALIAS"> 
    <INDIVIDUAL> 
     <xsl:copy-of select="ancestor::INDIVIDUAL/DATAID"/> 
     <xsl:copy-of select="ancestor::INDIVIDUAL/FIRST_NAME"/> 
     <xsl:copy-of select="ancestor::INDIVIDUAL/SECOND_NAME"/> 
     <THIRD_NAME><xsl:value-of select="ancestor::INDIVIDUAL/THIRD_NAME"/></THIRD_NAME> 
     <FOURTH_NAME><xsl:value-of select="ancestor::INDIVIDUAL/FOURTH_NAME"/></FOURTH_NAME> 
     <ALIAS><xsl:value-of select="ALIAS_NAME"/></ALIAS> 
     <QUALITY><xsl:value-of select="QUALITY"/></QUALITY> 
    </INDIVIDUAL> 
    </xsl:template> 

</xsl:transform> 

R(在 Linux/Mac 上通過命令行調用 xsltproc)

# Run the XSLT transformation with the xsltproc command-line tool.
# Arguments follow the documented order: options, stylesheet, then input file.
# shQuote() keeps paths containing spaces intact.
xslt_status <- system2(
    "xsltproc",
    args = c(
        "-o", shQuote("/path/to/output.xml"),
        shQuote("/path/to/xslt_script.xsl"),
        shQuote("/path/to/input.xml")
    )
)

# system2() returns the exit status of the command; stop rather than carry on
# with a missing or stale output file.
if (xslt_status != 0) {
    stop("xsltproc failed with exit status ", xslt_status, call. = FALSE)
}

R(在 Windows 上使用 MSXML COM 對象)

library(RDCOMClient)

# Input document, stylesheet and destination of the transformed XML.
xmlfile <- "C:/path/to/Input.xml"
xslfile <- "C:/path/to/XSLT_Script.xsl"
output <- "C:/path/to/Output.xml"

# One MSXML DOM document each for the source, the stylesheet and the result.
xmlDoc <- COMCreate("MSXML2.DOMDocument")
xslDoc <- COMCreate("MSXML2.DOMDocument")
newDoc <- COMCreate("MSXML2.DOMDocument")

# Load synchronously so that both documents are fully parsed before the
# transformation starts.
xmlDoc[["async"]] <- FALSE
xslDoc[["async"]] <- FALSE

# Load() reports success as a logical; fail early on a missing or malformed
# file instead of transforming an empty document.
if (!isTRUE(xmlDoc$Load(xmlfile))) {
    stop("Could not load XML file: ", xmlfile, call. = FALSE)
}
if (!isTRUE(xslDoc$Load(xslfile))) {
    stop("Could not load XSLT file: ", xslfile, call. = FALSE)
}

# Apply the stylesheet, writing the result tree into newDoc, then save it.
xmlDoc$transformNodeToObject(xslDoc, newDoc)
newDoc$Save(output)

# Drop the references to the COM objects.
xmlDoc <- NULL; xslDoc <- NULL; newDoc <- NULL

R

library(XML) 

doc <- xmlParse("/path/to/output.xml")

# Replace empty strings with NA column by column.
# Working on the columns in place keeps each column's type and the shape of
# the data frame; data.frame(sapply(...)) would go through a character matrix
# and collapses a one-row data frame into a single column.
blank_to_na <- function(data) {
    data[] <- lapply(data, function(col) {
        col[!is.na(col) & col == ""] <- NA
        col
    })
    data
}

# XPATHSAPPLY APPROACH -------------------------
# Each column is collected independently, so the columns only line up when
# every <INDIVIDUAL> in the transformed file contains each of these elements.
df <- data.frame(
    DATAID = xpathSApply(doc, "//DATAID", xmlValue),
    FIRST_NAME = xpathSApply(doc, "//FIRST_NAME", xmlValue),
    SECOND_NAME = xpathSApply(doc, "//SECOND_NAME", xmlValue),
    THIRD_NAME = xpathSApply(doc, "//THIRD_NAME", xmlValue),
    FOURTH_NAME = xpathSApply(doc, "//FOURTH_NAME", xmlValue),
    ALIAS = xpathSApply(doc, "//ALIAS", xmlValue),
    QUALITY = xpathSApply(doc, "//QUALITY", xmlValue),
    stringsAsFactors = FALSE
)
# REPLACE EMPTY STRING WITH NAs
df <- blank_to_na(df)

# XMLTODATAFRAME APPROACH -------------------------
df2 <- xmlToDataFrame(doc, nodes = getNodeSet(doc, "//INDIVIDUAL"), stringsAsFactors = FALSE)
# REPLACE EMPTY STRING WITH NAs
df2 <- blank_to_na(df2)

all.equal(df, df2)
# [1] TRUE 

df2 
# DATAID FIRST_NAME SECOND_NAME THIRD_NAME FOURTH_NAME        ALIAS QUALITY 
# 1 6908434  ABD AL-KHALIQ AL-HOUTHI   <NA>    Abd-al-Khaliq al-Huthi Good 
# 2 6908434  ABD AL-KHALIQ AL-HOUTHI   <NA> Abd-al-Khaliq Badr-al-Din al Huthi Good 
# 3 6908434  ABD AL-KHALIQ AL-HOUTHI   <NA> Abd al-Khaliq Badr al-Din al-Huthi Good 
# 4 6908434  ABD AL-KHALIQ AL-HOUTHI   <NA>    Abd al-Khaliq al-Huthi Good 
# 5 6908434  ABD AL-KHALIQ AL-HOUTHI   <NA>       Abu-Yunus  Low 
+0

感謝 Parfait(在上述轉換之後還給出了兩種導入方法),這幫了我很大的忙。 –

相關問題