據我理解,你可以把所有的數據幀放進一個列表,然後使用 reduce
對它們逐個進行 append
或 merge
操作。關於 reduce 的用法,請參閱其官方文檔。
首先,定義一些用於生成樣本數據的函數。
from functools import reduce

import numpy as np
import pandas
# GENERATE DATA
# Code 1 between 13 and 15
def generate_code_1(n):
    """Return an (n, 1) float array of random whole numbers in 13..15.

    Values are drawn with ``np.random.rand`` and floored, so each entry is
    one of 13.0, 14.0 or 15.0. Seed NumPy's global generator for
    reproducible output.
    """
    return np.floor(np.random.rand(n, 1) * 3 + 13)
# Code 2 between 1 and 1000
def generate_code_2(n):
    """Return an (n, 1) float array of random whole numbers in 1..1000.

    The uniform draw in [0, 1) is scaled to [0, 1000), floored, and then
    shifted by one so that the smallest possible value is 1.
    """
    return np.floor(np.random.rand(n, 1) * 1000) + 1
# Distance between 0 and 9
def generate_distance(n):
    """Return an (n, 1) float array of random whole numbers in 0..9."""
    return np.floor(np.random.rand(n, 1) * 10)
# Generate a data frame as hstack of 3 arrays
def generate_data_frame(n):
    """Build a random sample frame with columns 'Code 1', 'Code 2', 'Distance'.

    ``n`` rows are drawn, then rows sharing the same ('Code 1', 'Code 2')
    pair are collapsed into one row that keeps the smallest distance, so the
    returned frame may have fewer than ``n`` rows.
    """
    data = np.hstack([
        generate_code_1(n),
        generate_code_2(n),
        generate_distance(n),
    ])
    df = pandas.DataFrame(data=data, columns=['Code 1', 'Code 2', 'Distance'])
    # Remove possible duplications of Code 1 and Code 2. Take smallest distance
    # in case of duplications. Duplications will break the merge method, however
    # they will not break the append method.
    grouped = df.groupby(['Code 1', 'Code 2'], as_index=False)
    # Use the built-in reduction: passing np.min as a callable to aggregate()
    # is deprecated in recent pandas releases.
    return grouped.min()
# Generate n data frames each with m rows in a list
def generate_data_frames(n, m, with_count=False):
    """Return a list of ``n`` sample frames, each requested with ``m`` rows.

    Each frame comes from ``generate_data_frame(m)``. When ``with_count`` is
    true, every frame gets a 'Count' column set to 1; the merge method needs
    it to keep track of how many frames contributed to each code pair.
    """
    df_list = []
    for _ in range(n):
        df = generate_data_frame(m)
        # Add count column, needed for merge method to keep track of how many
        # cases we have seen
        if with_count:
            df['Count'] = 1
        df_list.append(df)
    return df_list
Append方法(更快,更短,更好)
df_list = generate_data_frames(94, 5)
# Stack all data frames in a single pass. DataFrame.append was removed in
# pandas 2.0, and folding it with reduce re-copied the accumulated rows at
# every step; pandas.concat produces the same stacked frame directly.
df_append = pandas.concat(df_list)
# Aggregate by Code 1 and Code 2
df_append_grouped = df_append.groupby(['Code 1', 'Code 2'], as_index=False)
# Built-in mean instead of aggregate(np.mean), which is deprecated as a callable
df_append_result = df_append_grouped.mean()
df_append_result
Merge 方法
# 94 sample frames of 5 requested rows each, with the Count column for merging
df_list = generate_data_frames(n=94, m=5, with_count=True)
# Function to be passed to reduce. Merge 2 data frames and update Distance and Count
def merge_dfs(df_1, df_2):
    """Outer-merge two frames on the code pair, summing Distance and Count.

    Both inputs must have the columns 'Code 1', 'Code 2', 'Distance' and
    'Count'. A code pair present in only one frame contributes 0 from the
    other side, so its Distance and Count pass through unchanged. The result
    has the same four columns and is suitable as the accumulator of a
    ``reduce`` over a list of such frames.
    """
    merged = pandas.merge(
        df_1, df_2,
        on=['Code 1', 'Code 2'],
        how='outer',
        suffixes=('', '_y'),
    )
    # Pairs missing on one side come back as NaN; treat them as zero so the
    # sums below stay defined.
    merged = merged.fillna(0)
    # pop() returns the right-hand column and removes it in one step.
    merged['Distance'] = merged['Distance'] + merged.pop('Distance_y')
    merged['Count'] = merged['Count'] + merged.pop('Count_y')
    return merged
# Use reduce to apply merge over the list of data frames.
# On Python 3, reduce is not a builtin and must be imported from functools.
df_merge_result = reduce(merge_dfs, df_list)
# Replace distance with its mean and drop Count: Distance holds the running
# sum and Count the number of contributing frames, so their ratio is the mean.
counts = df_merge_result.pop('Count')
df_merge_result['Distance'] = df_merge_result['Distance'] / counts
df_merge_result
你的 'Code1' 和 'Code2' 在同一個數據幀中有可能相同嗎? –
我不確定我是否理解:如果 Code1 和 Code2 匹配,您是想把距離列的值彼此相加嗎?是在各個 df 之間相加嗎?與索引無關嗎?另外,如果您有 N 個列完全相同的 DataFrame,爲什麼不把所有數據放進一個大的 df,然後用按條件求和之類的方式來處理呢? – nico
@AntonProtopopov,是的,可能是一樣的。 –