2017-08-24 73 views
-3

如何用Python合併CSV文件中的數百列?假設我有一個包含數百列(逗號分隔)的CSV文件:

ID,Column1,Column2,...,Column700 
1,data,,..., 
2,,data,..., 
... 
700,,,...,data 

如何合併這些列,使所有的「數據」都在同一列中?該CSV文件是由此處的JSON文件創建的:https://nvd.nist.gov/vuln/data-feeds#JSON_FEED

這是我用來把JSON轉換爲CSV的代碼(取自另一個SO帖子):

def to_string(s):
    """Return *s* converted to a string.

    ``str(s)`` is tried first. If that fails with a Unicode error, the
    UTF-8 encoding of *s* is returned instead.
    """
    try:
        return str(s)
    except UnicodeError:
        # Only encoding problems are handled here; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        # Change the encoding type if needed
        return s.encode('utf-8')

def reduce_item(key, value):
    """Flatten *value* into the module-level ``reduced_item`` dict.

    Lists and dicts are walked recursively. Each nesting level appends
    ``_<index>`` (for lists) or ``_<sub_key>`` (for dicts) to *key*.
    Any other value is stored as a string under the accumulated key.

    The result is written to the global ``reduced_item``; nothing is
    returned. The caller must bind ``reduced_item`` to a dict beforehand.
    """
    global reduced_item

    # Reduction Condition 1
    # isinstance (rather than an exact type check) also flattens list
    # subclasses instead of stringifying them whole.
    if isinstance(value, list):
        for i, sub_item in enumerate(value):
            reduce_item(key + '_' + to_string(i), sub_item)

    # Reduction Condition 2
    # Likewise accepts dict subclasses such as OrderedDict.
    elif isinstance(value, dict):
        for sub_key, sub_value in value.items():
            reduce_item(key + '_' + to_string(sub_key), sub_value)

    # Base Condition
    else:
        reduced_item[to_string(key)] = to_string(value)

if __name__ == "__main__":
    # Convert the JSON file to a CSV file with one column per flattened key.

    # Hard-coded defaults, used when no command-line arguments are given.
    # Raw strings keep the backslash literal: in a normal string "\f" is a
    # form-feed escape, which silently produced a wrong path.
    node = "CVE_Items"
    json_file_path = r"some\file.json"
    csv_file_path = r"some\file.csv"

    if len(sys.argv) == 4:
        # Reading arguments
        node, json_file_path, csv_file_path = sys.argv[1:4]
    elif len(sys.argv) != 1:
        # Wrong argument count: show the usage hint, then carry on with the
        # defaults above, as earlier versions of this script did.
        print("\nUsage: python json_to_csv.py <node_name> <json_in_file_path> <csv_out_file_path>\n")

    # Read as UTF-8 and close the file as soon as it has been parsed.
    with open(json_file_path, 'r', encoding='utf-8') as fp:
        raw_data = json.load(fp)

    # Use the named node when present; otherwise process the whole document.
    try:
        data_to_be_processed = raw_data[node]
    except (KeyError, TypeError):
        # KeyError: a dict without the node. TypeError: the top level is a list.
        data_to_be_processed = raw_data

    processed_data = []
    header = set()
    for item in data_to_be_processed:
        # reduce_item() writes into this module-level dict.
        reduced_item = {}
        reduce_item(node, item)

        header.update(reduced_item)
        processed_data.append(reduced_item)

    # One column per distinct flattened key, in sorted order.
    header = sorted(header)

    # newline='' lets the csv module control line endings itself.
    with open(csv_file_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, header, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows(processed_data)

這是JSON文件中的一個示例條目:

{ 
    "CVE_data_type" : "CVE", 
    "CVE_data_format" : "MITRE", 
    "CVE_data_version" : "4.0", 
    "CVE_data_numberOfCVEs" : "6208", 
    "CVE_data_timestamp" : "2017-08-14T18:06Z", 
    "CVE_Items" : [ { 
    "cve" : { 
     "CVE_data_meta" : { 
     "ID" : "CVE-2003-1547" 
     }, 
     "affects" : { 
     "vendor" : { 
      "vendor_data" : [ { 
      "vendor_name" : "francisco_burzi", 
      "product" : { 
       "product_data" : [ { 
       "product_name" : "php-nuke", 
       "version" : { 
        "version_data" : [ { 
        "version_value" : "6.5" 
        }, { 
        "version_value" : "6.5_beta1" 
        }, { 
        "version_value" : "6.5_rc3" 
        }, { 
        "version_value" : "6.5_rc2" 
        }, { 
        "version_value" : "6.5_rc1" 
        } ] 
       } 
       } ] 
      } 
      } ] 
     } 
     }, 
     "problemtype" : { 
     "problemtype_data" : [ { 
      "description" : [ { 
      "lang" : "en", 
      "value" : "CWE-79" 
      } ] 
     } ] 
     }, 
     "references" : { 
     "reference_data" : [ { 
      "url" : "http://secunia.com/advisories/8478" 
     }, { 
      "url" : "http://securityreason.com/securityalert/3718" 
     }, { 
      "url" : "http://www.securityfocus.com/archive/1/archive/1/316925/30/25250/threaded" 
     }, { 
      "url" : "http://www.securityfocus.com/archive/1/archive/1/317230/30/25220/threaded" 
     }, { 
      "url" : "http://www.securityfocus.com/bid/7248" 
     }, { 
      "url" : "https://exchange.xforce.ibmcloud.com/vulnerabilities/11675" 
     } ] 
     }, 
     "description" : { 
     "description_data" : [ { 
      "lang" : "en", 
      "value" : "Cross-site scripting (XSS) vulnerability in block-Forums.php in the Splatt Forum module for PHP-Nuke 6.x allows remote attackers to inject arbitrary web script or HTML via the subject parameter." 
     } ] 
     } 
    }, 
    "configurations" : { 
     "CVE_data_version" : "4.0", 
     "nodes" : [ { 
     "operator" : "OR", 
     "cpe" : [ { 
      "vulnerable" : true, 
      "cpeMatchString" : "cpe:/a:francisco_burzi:php-nuke:6.5", 
      "cpe23Uri" : "cpe:2.3:a:francisco_burzi:php-nuke:6.5:*:*:*:*:*:*:*" 
     }, { 
      "vulnerable" : true, 
      "cpeMatchString" : "cpe:/a:francisco_burzi:php-nuke:6.5_beta1", 
      "cpe23Uri" : "cpe:2.3:a:francisco_burzi:php-nuke:6.5_beta1:*:*:*:*:*:*:*" 
     }, { 
      "vulnerable" : true, 
      "cpeMatchString" : "cpe:/a:francisco_burzi:php-nuke:6.5_rc1", 
      "cpe23Uri" : "cpe:2.3:a:francisco_burzi:php-nuke:6.5_rc1:*:*:*:*:*:*:*" 
     }, { 
      "vulnerable" : true, 
      "cpeMatchString" : "cpe:/a:francisco_burzi:php-nuke:6.5_rc2", 
      "cpe23Uri" : "cpe:2.3:a:francisco_burzi:php-nuke:6.5_rc2:*:*:*:*:*:*:*" 
     }, { 
      "vulnerable" : true, 
      "cpeMatchString" : "cpe:/a:francisco_burzi:php-nuke:6.5_rc3", 
      "cpe23Uri" : "cpe:2.3:a:francisco_burzi:php-nuke:6.5_rc3:*:*:*:*:*:*:*" 
     } ] 
     } ] 
    }, 
    "impact" : { 
     "baseMetricV2" : { 
     "cvssV2" : { 
      "vectorString" : "(AV:N/AC:M/Au:N/C:N/I:P/A:N)", 
      "accessVector" : "NETWORK", 
      "accessComplexity" : "MEDIUM", 
      "authentication" : "NONE", 
      "confidentialityImpact" : "NONE", 
      "integrityImpact" : "PARTIAL", 
      "availabilityImpact" : "NONE", 
      "baseScore" : 4.3 
     }, 
     "severity" : "MEDIUM", 
     "exploitabilityScore" : 8.6, 
     "impactScore" : 2.9, 
     "obtainAllPrivilege" : false, 
     "obtainUserPrivilege" : false, 
     "obtainOtherPrivilege" : false, 
     "userInteractionRequired" : true 
     } 
    }, 
    "publishedDate" : "2003-12-31T05:00Z", 
    "lastModifiedDate" : "2017-08-08T01:29Z" 
    } ] 
} 
+0

如果您能提供一個簡單的輸入和輸出示例會很有幫助——當然,示例中可以省略那700列;) –

+0

您是否有任何已在解決方案上啓動的Python代碼?爲了將JSON轉換爲CSV,您可以直接將JSON數據作爲PHP中的對象,並將其轉儲爲CSV格式,只需幾行即可。你可以在Python中做同樣的事情。 –

+0

@Aron - 根據要求添加樣品。我目前沒有使用PHP的選項。 –

回答

0

如果我理解正確的話,你有一個包含很多列的CSV文件,但每條記錄只填充了ID和一個(非ID)列,並且沒有兩條記錄填充了相同的(非ID)列。所以基本上你想把所有700條記錄「摺疊」成一條。

由於ID值全都不同,我不確定你想如何將它們合併成一條記錄。我會假設你知道在這種情況下要做什麼,所以我不打算試圖讓這個部分正確。

假設你的數據在一個名爲nist.csv的文件中,你可以像這樣把記錄讀入一個merged_record:

import csv 
# Read nist.csv and fold every data row into a single merged_record.
# newline='' is what the csv module expects for files it reads, and the
# with-block closes the file once all rows have been consumed.
with open('nist.csv', newline='') as inputfile:
    reader = csv.reader(inputfile)

    # Extract the header (an empty file yields an empty header):
    header = next(reader, [])

    # Initialize an empty merged_record:
    merged_record = [''] * len(header)

    # Populate the merged_record with data from the CSV file:
    for record in reader:
        for i, value in enumerate(record):
            if value:  # only overwrite if the value is non-empty
                merged_record[i] = value

一旦你有了header和merged_record,你可以像這樣把它們輸出到一個名爲nist_merged.csv的新CSV文件中:

# Write the header and the single merged row to nist_merged.csv.
# newline='' prevents the csv module's own line endings from being
# translated again, which would insert blank rows on Windows.
with open('nist_merged.csv', 'w', newline='') as outputfile:
    writer = csv.writer(outputfile)
    writer.writerow(header)
    writer.writerow(merged_record)

您可以把它簡化成這樣的一行:

# Still one line, but the with-statement guarantees the file is flushed and closed.
with open('nist_merged.csv', 'w', newline='') as f: csv.writer(f).writerows([header, merged_record])

此外,ID字段可能不是您想要的,但由於此新CSV文件中只有兩行,所以您應該很容易將其更改爲您想要的值。