I developed a script that processes a CSV file and generates another file with the results. The script runs successfully with limited test data, but when I execute it against the actual data file, which has 25 million rows across 15 columns, the same script hangs and then abruptly shuts down. See the attached error screenshot. In short: Python.exe hangs while running a script that uses pandas and large lists.
So, is there any maximum limit on how much data I can read from a CSV file with pandas, or on how many records I can store in a list?
Please share your ideas for optimizing the script below.
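For what it's worth, pandas itself imposes no fixed row limit, and neither do Python lists; both are bounded by available RAM. A minimal sketch for estimating whether the full frame fits, reusing the sourceFile and searchParam3 inputs from the script below (the 100000-row sample size is an arbitrary choice):

import pandas as pd

# Parse a sample of the file and extrapolate its in-memory footprint.
sample = pd.read_csv(sourceFile, sep=searchParam3, nrows=100000)
bytes_per_row = sample.memory_usage(deep=True).sum() / float(len(sample))
print 'Estimated footprint of 25M rows : %.1f GB' % (bytes_per_row * 25e6 / 1e9)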
Below is the script.
import csv
import operator
import pandas as pd
import time

print time.strftime('Script Start Time : ' + "%Y-%m-%d %H:%M:%S")
sourceFile = raw_input('Enter file name along with path : ')
searchParam1 = raw_input('Enter first column name containing MSISDN : ').lower()
searchParam2 = raw_input('Enter second column name containing DATE-TIME : ').lower()
searchParam3 = raw_input('Enter file separator (,/#/|/:/;) : ')
df = pd.read_csv(sourceFile, sep=searchParam3)
df.columns = df.columns.str.lower()
df = df.rename(columns={searchParam1: 'msisdn', searchParam2: 'datetime'})
destFileWritter = csv.writer(open(sourceFile + ' - ProcessedFile.csv', 'wb'))
destFileWritter.writerow(df.keys().tolist())
# sort by subscriber and timestamp, then materialize everything as Python lists
sortedcsvList = df.sort_values(['msisdn', 'datetime']).values.tolist()
rows = [row for row in sortedcsvList]
col_1 = [row[df.columns.get_loc('msisdn')] for row in rows]
col_2 = [row[df.columns.get_loc('datetime')] for row in rows]
# for each msisdn, write the last (latest) row of its group
for i in range(0, len(col_1) - 1):
    if col_1[i] == col_1[i + 1]:
        #print('Inside If...')
        continue
    else:
        for row in rows:
            if col_1[i] in row:
                if col_2[i] in row:
                    #print('Inside else...')
                    destFileWritter.writerow(row)
destFileWritter.writerow(rows[len(rows) - 1])
print('Processing Completed, Kindly Check Response File On Same Location.')
print time.strftime('Script End Time : ' + "%Y-%m-%d %H:%M:%S")
raw_input('Press Enter to Exit...')
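As a side note on where the time goes: the nested loop above rescans the entire row list once for every distinct MSISDN, which is quadratic and will not finish in any reasonable time on 25 million rows. Its effect, keeping the last row of each msisdn group after sorting, appears to map directly onto pandas' drop_duplicates. A minimal sketch (the helper name is hypothetical; it mirrors the lower-casing and renaming done in the script):

import pandas as pd

def keep_latest_per_msisdn(sourceFile, sep, msisdnCol, datetimeCol):  # hypothetical helper
    df = pd.read_csv(sourceFile, sep=sep)
    df.columns = df.columns.str.lower()
    df = df.rename(columns={msisdnCol: 'msisdn', datetimeCol: 'datetime'})
    # Sort by subscriber and timestamp, then keep only the last
    # (latest) row of each msisdn group -- O(n log n) overall.
    out = df.sort_values(['msisdn', 'datetime']).drop_duplicates('msisdn', keep='last')
    out.to_csv(sourceFile + ' - ProcessedFile.csv', index=False)

This stays vectorized inside pandas and never materializes the rows as Python lists, which is where much of the memory in the original script goes.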
Updated script:
import csv
import operator
import pandas as pd
import time
import sys

print time.strftime('Script Start Time : ' + "%Y-%m-%d %H:%M:%S")
sourceFile = raw_input('Enter file name along with path : ')
searchParam1 = raw_input('Enter first column name containing MSISDN : ').lower()
searchParam2 = raw_input('Enter second column name containing DATE-TIME : ').lower()
searchParam3 = raw_input('Enter file separator (,/#/|/:/;) : ')

def csvSortingFunc(sourceFile, searchParam1, searchParam2, searchParam3):
    CHUNKSIZE = 10000
    # process the file in chunks of CHUNKSIZE rows instead of reading it all at once
    for chunk in pd.read_csv(sourceFile, chunksize=CHUNKSIZE, sep=searchParam3):
        df = chunk
        #df = pd.read_csv(sourceFile, sep=searchParam3)
        df.columns = df.columns.str.lower()
        df = df.rename(columns={searchParam1: 'msisdn', searchParam2: 'datetime'})
        """destFileWritter = csv.writer(open(sourceFile + ' - ProcessedFile.csv','wb'))
        destFileWritter.writerow(df.keys().tolist()) """
        resultList = []
        resultList.append(df.keys().tolist())
        sortedcsvList = df.sort_values(['msisdn', 'datetime']).values.tolist()
        rows = [row for row in sortedcsvList]
        col_1 = [row[df.columns.get_loc('msisdn')] for row in rows]
        col_2 = [row[df.columns.get_loc('datetime')] for row in rows]
        for i in range(0, len(col_1) - 1):
            if col_1[i] == col_1[i + 1]:
                #print('Inside If...')
                continue
            else:
                for row in rows:
                    if col_1[i] in row:
                        if col_2[i] in row:
                            #print('Inside else...')
                            #destFileWritter.writerow(row)
                            resultList.append(row)
        #destFileWritter.writerow(rows[len(rows)-1])
        resultList.append(rows[len(rows) - 1])
        # note: this overwrites the same output file on every chunk
        writedf = pd.DataFrame(resultList)
        writedf.to_csv(sourceFile + ' - ProcessedFile.csv', header=False, index=False)
        #print('Processing Completed, Kindly Check Response File On Same Location.')

csvSortingFunc(sourceFile, searchParam1, searchParam2, searchParam3)
print('Processing Completed, Kindly Check Response File On Same Location.')
print time.strftime('Script End Time : ' + "%Y-%m-%d %H:%M:%S")
raw_input('Press Enter to Exit...')
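Two caveats about this chunked version that may explain its output: to_csv is called with its default write mode inside the loop, so each chunk overwrites the file and only the last chunk survives; and rows of an msisdn that straddles a chunk boundary are never compared against each other. A minimal sketch of one way around both, assuming the source columns are already named msisdn and datetime after lower-casing (the helper name and chunk size are arbitrary):

import os
import pandas as pd

def csvSortingChunked(sourceFile, sep, chunksize=100000):  # hypothetical helper
    destFile = sourceFile + ' - ProcessedFile.csv'
    if os.path.exists(destFile):
        os.remove(destFile)  # start clean, since chunks are appended below
    first = True
    for chunk in pd.read_csv(sourceFile, sep=sep, chunksize=chunksize):
        chunk.columns = chunk.columns.str.lower()
        part = chunk.sort_values(['msisdn', 'datetime']).drop_duplicates('msisdn', keep='last')
        # mode='a' appends each chunk; the header is written only once
        part.to_csv(destFile, mode='a', header=first, index=False)
        first = False
    # Second pass over the deduplicated (typically much smaller) file to
    # merge msisdn groups that were split across chunk boundaries.
    out = pd.read_csv(destFile)
    out.sort_values(['msisdn', 'datetime']).drop_duplicates('msisdn', keep='last') \
       .to_csv(destFile, index=False)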
Maybe there is some memory limit? Have you checked? – Paddy