2016-11-17 49 views
0
"SURNAME","GIVENNAME","MIDDLENAME","UPIN","NAME","CODE" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770 

讓我們假設這是一個大文件的格式。我想用 Python 把這個巨大的文件分割成多個指定大小的文件,並且每個分割後的文件都需要包含標題行("SURNAME","GIVENNAME","MIDDLENAME","UPIN","NAME","CODE")。提前感謝。


import os 
import sys 

def getfilesize(filename):
    """Return the size of ``filename`` in bytes.

    The file is opened in binary mode and the read position is moved to the
    end of the file; the offset reported there is the size. The size is also
    printed to standard output.
    """
    with open(filename, "rb") as handle:
        handle.seek(0, os.SEEK_END)  # jump straight to the end of the file
        size = handle.tell()
    print("getfilesize: size: %s" % size)
    return size

def splitfile(filename, splitsize, readlimit=1000000):
    """Split ``filename`` into consecutive pieces of at most ``splitsize`` bytes.

    Pieces are written next to the input as ``<root>_<n><ext>`` with ``n``
    starting at 1 (``data.csv`` -> ``data_1.csv``, ``data_2.csv``, ...).
    Existing files with those names are overwritten. Every piece holds exactly
    ``splitsize`` bytes except the last, which holds the remainder; an empty
    input produces no pieces.

    The split is made on raw byte boundaries: a line may be cut between two
    pieces and no header line is repeated.

    Args:
        filename: Path of the file to split.
        splitsize: Maximum size of each piece, in bytes.
        readlimit: Maximum number of bytes read into memory at once.

    Returns:
        None. If ``filename`` is not an existing file a message is printed and
        nothing is written.

    Raises:
        ValueError: If ``splitsize`` or ``readlimit`` is not positive.
    """
    if not os.path.isfile(filename):
        print("No such file as: \"%s\"" % filename)
        return

    splitsize = int(splitsize)
    readlimit = int(readlimit)
    if splitsize <= 0:
        raise ValueError("splitsize must be a positive number of bytes")
    if readlimit <= 0:
        raise ValueError("readlimit must be a positive number of bytes")

    filesize = os.path.getsize(filename)
    # Ceiling division: a trailing partial piece still needs its own file.
    n_splits = -(-filesize // splitsize)
    print("splitfile: No of splits required: %s" % str(n_splits))

    # splitext keeps directories and earlier dots intact, unlike split(".").
    root, ext = os.path.splitext(filename)

    with open(filename, "rb") as fr:
        for counter in range(1, n_splits + 1):
            remaining = splitsize
            # "wb" truncates any previous piece with the same name.
            with open("{0}_{1}{2}".format(root, counter, ext), "wb") as fw:
                while remaining > 0:
                    # Never read past the piece boundary.
                    data = fr.read(min(readlimit, remaining))
                    if not data:
                        break
                    fw.write(data)
                    remaining -= len(data)

if __name__ == "__main__":
    # Command-line entry point: filesplit.py <filename> <split size in kB>
    if len(sys.argv) < 3:
        print("Filename or splitsize not provided: Usage:  filesplit.py filename splitsizeinkb ")
    else:
        filename = sys.argv[1]
        try:
            splitsize_kb = int(sys.argv[2])
        except ValueError:
            # Treat unparsable input like any other invalid size below.
            splitsize_kb = 0
        if splitsize_kb <= 0:
            print("splitsizeinkb must be a positive whole number, got: \"%s\"" % sys.argv[2])
        else:
            # The argument is given in kB (1 kB = 1000 bytes); splitfile
            # expects the size in bytes.
            splitfile(filename, splitsize_kb * 1000)

這段代碼可以正常分割文件,但分割後的文件裡沒有標題行。很抱歉,我是 Stack Overflow 的新手。

+0

你嘗試過什麼嗎?請發佈您的嘗試。還有什麼樣的文件分裂成這些?什麼是標準,你能提供樣品嗎? – EoinS

+0

這裏有什麼問題? – marcadian

+0

它必須在Python嗎?其他方法可能會更快。無論如何,告訴我們你試過了什麼。 –

回答

0

這應該這樣做

import os 

maxlines = 1000  # maximum number of data lines (header excluded) per output file
infilepath = 'path/to/file'


def split_with_header(infilepath, maxlines):
    """Split a text file into numbered parts that each start with its header.

    The first line of ``infilepath`` is treated as the header. The remaining
    lines are written, ``maxlines`` at a time, to files placed in the same
    directory and named ``<name><n><ext>`` with ``n`` starting at 0
    (``data.csv`` -> ``data0.csv``, ``data1.csv``, ...). Part 0 is always
    created, even when the input has no data lines.

    Args:
        infilepath: Path of the file to split.
        maxlines: Maximum number of data lines per output file.

    Returns:
        The list of output paths, in the order they were written.

    Raises:
        ValueError: If ``maxlines`` is smaller than 1.
    """
    if maxlines < 1:
        raise ValueError("maxlines must be at least 1")

    dirpath = os.path.dirname(infilepath)
    # splitext also copes with names that have no extension.
    fname, ext = os.path.splitext(os.path.basename(infilepath))

    def part_path(index):
        return os.path.join(dirpath, "{}{}{}".format(fname, index, ext))

    written = []
    with open(infilepath) as infile:
        header = infile.readline()
        written.append(part_path(0))
        outfile = open(written[-1], 'w')
        try:
            outfile.write(header)
            for i, line in enumerate(infile):
                # Start a new part every maxlines data lines (but not at i == 0,
                # because part 0 is already open).
                if i and i % maxlines == 0:
                    outfile.close()
                    written.append(part_path(i // maxlines))
                    outfile = open(written[-1], 'w')
                    outfile.write(header)
                outfile.write(line)
        finally:
            # Close the part being written even if reading or writing failed.
            outfile.close()
    return written


if __name__ == "__main__":
    split_with_header(infilepath, maxlines)
+0

謝謝 inspectorG4dget,但我得到了錯誤:Traceback(最近一次調用在最後): 文件「C:/Users/Henry/Desktop/G_Scripts/Py/split_new.py」,第 7 行,outfile.close() 處:NameError: name 'outfile' is not defined。我剛接觸 Stack Overflow,對 Python 也只有基礎了解,提前致謝 – GOU7HAM

+0

@ GOU7HAM:什麼是錯誤 – inspectorG4dget

+0

回溯(最近一次調用在最後): 文件「C:/Users/Henry/Desktop/G_Scripts/Py/split_new.py」,第 7 行,outfile.close() 處:NameError: name 'outfile' is not defined – GOU7HAM

2

我用 pandas 將大文件分割成更小的文件

import pandas as pd 

infile = 'path/to/file.csv'  # path to your file


def split_csv(infile, chunksize=1000000, out_prefix='chunk_'):
    """Split a comma-separated file into smaller CSV files with pandas.

    The input is read ``chunksize`` data rows at a time and each chunk is
    written to ``<out_prefix><n>.csv`` with ``n`` starting at 0. Every output
    file starts with the header row and stays comma-separated.

    Args:
        infile: Path of the CSV file to split.
        chunksize: Maximum number of data rows per output file.
        out_prefix: Path prefix of the output files; the default writes
            ``chunk_0.csv``, ``chunk_1.csv``, ... to the current directory.

    Returns:
        The list of output paths, in the order they were written.
    """
    written = []
    # dtype=str and keep_default_na=False keep every cell as the text found in
    # the input, so values such as "0770" or "NA" are not rewritten.
    reader = pd.read_csv(infile, sep=',', chunksize=chunksize,
                         dtype=str, keep_default_na=False)
    for n, chunk in enumerate(reader):
        o_path = '{}{}.csv'.format(out_prefix, n)
        chunk.to_csv(o_path, sep=',', index=False, header=True)
        written.append(o_path)
    return written


if __name__ == "__main__":
    split_csv(infile)

chunksize 表示每個輸出文件要包含多少行數據。