In data mining work, feature engineering accounts for a large share of the effort. In a recent Vipshop purchase-prediction competition, I found that when generating features, collecting per-user rows with a Python list's append method and building the DataFrame once at the end is far faster than repeatedly joining rows with pandas' concat. I then tried to speed up the concat approach with parallelism and saw no improvement at all; perhaps I was doing it wrong. I'm pasting the code here for the record. tt is an extracted DataFrame of shape (100000, 4) used for the timing comparison.
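The original tt comes from the competition data, so the timings below are not directly reproducible. For a self-contained test, a synthetic stand-in with the same shape and the columns the code relies on (uid, action_type, date) could be generated like this; the user count, value ranges, date window, and the filler fourth column are my assumptions, not the competition data:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
n = 100000
tt = pd.DataFrame({
    'uid': rng.randint(0, 20000, n),          # ~20k hypothetical users
    'action_type': rng.randint(0, 4, n),      # 1 = purchase, as in the code below
    'date': pd.Timestamp('1900-03-01')        # dates in the 1900-03/04 window used below
            + pd.to_timedelta(rng.randint(0, 30, n), unit='D'),
    'item_id': rng.randint(0, 5000, n),       # filler 4th column, name is hypothetical
})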
1. Building the feature set with list.append
from time import time
import datetime
import pandas as pd

start = time()
X = []
for each_uid, group in tt.groupby(by=['uid']):
    time_alpha = datetime.date(1900, 3, 20)
    time_alpha2 = datetime.date(1900, 4, 1)
    #df = pd.Series({'uid': each_uid})
    user_activity = group.shape[0]                              # user activity: total number of views
    user_buyablity = group[group['action_type'] == 1].shape[0]  # purchasing power: total number of purchases
    user_takerate = user_buyablity / user_activity              # conversion rate
    user_near_activity = 0                                      # activity in the last 10 days
    for i in group['date']:
        if i.date() > time_alpha:
            user_near_activity += 1
    user_first_time = min(group['date']).date()                 # first active date
    user_last_time = max(group['date']).date()                  # last active date
    user_timedalta = (user_last_time - user_first_time).days    # span between first and last activity
    user_near_timedelta = (time_alpha2 - user_last_time).days   # days from last activity to 4.1
    X1 = [each_uid, user_activity, user_buyablity, user_takerate,
          user_near_activity, user_timedalta, user_near_timedelta]
    X.append(X1)
names = ['each_uid', 'user_activity', 'user_buyablity', 'user_takerate',
         'user_near_activity', 'user_timedalta', 'user_near_timedelta']
X = pd.DataFrame(X, columns=names)
stop = time()
print(str(stop - start) + " seconds")
Runtime: 42.0529999733 seconds
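As an aside beyond the original comparison: the per-user Python loop can often be avoided entirely with vectorized groupby aggregation, which tends to be faster still. A minimal sketch, reusing the imports and tt above and assuming tt['date'] is a datetime64 column (the normalize() call mimics the .date() comparisons in the loop):

time_alpha = pd.Timestamp(1900, 3, 20)
time_alpha2 = pd.Timestamp(1900, 4, 1)
day = tt['date'].dt.normalize()                # drop the time-of-day part, like .date()
g = tt.groupby('uid')
feats = pd.DataFrame({
    'user_activity': g.size(),                                           # total views
    'user_buyablity': tt['action_type'].eq(1).groupby(tt['uid']).sum(),  # total purchases
    'user_near_activity': day.gt(time_alpha).groupby(tt['uid']).sum(),   # views after 3.20
    'first': day.groupby(tt['uid']).min(),
    'last': day.groupby(tt['uid']).max(),
})
feats['user_takerate'] = feats['user_buyablity'] / feats['user_activity']
feats['user_timedalta'] = (feats['last'] - feats['first']).dt.days
feats['user_near_timedelta'] = (time_alpha2 - feats['last']).dt.days
X_vec = (feats.drop(columns=['first', 'last'])
              .reset_index()
              .rename(columns={'uid': 'each_uid'}))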
2. Building the feature set with pd.concat
start = time()
X = pd.DataFrame()
for each_uid, group in tt.groupby(by=['uid']):
    time_alpha = datetime.date(1900, 3, 20)
    time_alpha2 = datetime.date(1900, 4, 1)
    df = pd.Series({'uid': each_uid})
    user_activity = group.shape[0]                              # user activity: total number of views
    user_buyablity = group[group['action_type'] == 1].shape[0]  # purchasing power: total number of purchases
    user_takerate = user_buyablity / user_activity              # conversion rate
    user_near_activity = 0                                      # activity in the last 10 days
    for i in group['date']:
        if i.date() > time_alpha:
            user_near_activity += 1
    user_first_time = min(group['date']).date()                 # first active date
    user_last_time = max(group['date']).date()                  # last active date
    user_timedalta = (user_last_time - user_first_time).days    # span between first and last activity
    user_near_timedelta = (time_alpha2 - user_last_time).days   # days from last activity to 4.1
    df['user_activity'] = user_activity
    df['user_buyablity'] = user_buyablity
    df['user_takerate'] = user_takerate
    df['user_near_activity'] = user_near_activity
    df['user_timedalta'] = user_timedalta
    df['user_near_timedelta'] = user_near_timedelta
    X = pd.concat([X, df.to_frame().T], axis=0)  # re-copies everything accumulated so far
stop = time()
print(str(stop - start) + " seconds")
Runtime: 284.087000132 seconds
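The slowdown is expected: each pd.concat([X, ...]) inside the loop allocates a new DataFrame and copies every row accumulated so far, so total work grows quadratically with the number of users, while the append version copies each row once. A middle ground that keeps the per-user Series style but pays for only one concat at the end could look like this (my sketch, shown with just two of the features for brevity):

rows = []
for each_uid, group in tt.groupby(by=['uid']):
    s = pd.Series({'uid': each_uid,
                   'user_activity': group.shape[0],
                   'user_buyablity': (group['action_type'] == 1).sum()})
    rows.append(s.to_frame().T)
X2 = pd.concat(rows, axis=0, ignore_index=True)  # single O(n) concat instead of n copies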
3. Parallel processing
from joblib import Parallel, delayed

def applyParallel(dfGrouped, func):
    with Parallel(n_jobs=32) as parallel:
        retLst = parallel(delayed(func)(group) for name, group in dfGrouped)
    return pd.concat(retLst, axis=0)

# user features, user_num = 196030
# style 3:
def getUserFeatures(group):
    time_alpha = datetime.date(1900, 3, 20)
    time_alpha2 = datetime.date(1900, 4, 1)
    df = pd.Series({'uid': group.iloc[0]['uid']})
    user_activity = group.shape[0]                              # user activity: total number of views
    user_buyablity = group[group['action_type'] == 1].shape[0]  # purchasing power: total number of purchases
    user_takerate = user_buyablity / user_activity              # conversion rate
    user_near_activity = 0                                      # activity in the last 10 days
    for i in group['date']:
        if i.date() > time_alpha:
            user_near_activity += 1
    user_first_time = min(group['date']).date()                 # first active date
    user_last_time = max(group['date']).date()                  # last active date
    user_timedalta = (user_last_time - user_first_time).days    # span between first and last activity
    user_near_timedelta = (time_alpha2 - user_last_time).days   # days from last activity to 4.1
    df['user_activity'] = user_activity
    df['user_buyablity'] = user_buyablity
    df['user_takerate'] = user_takerate
    df['user_near_activity'] = user_near_activity
    df['user_timedalta'] = user_timedalta
    df['user_near_timedelta'] = user_near_timedelta
    return df.to_frame().T

X = applyParallel(tt.groupby(by=['uid']), getUserFeatures)
Runtime: hangs (never finishes)
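In hindsight the hang is not too surprising: with 196030 users, applyParallel submits 196030 tiny tasks, each group DataFrame has to be pickled and shipped to one of 32 worker processes, and the few microseconds of real work per group are swamped by inter-process overhead (32 processes on a machine with fewer cores only adds contention). If the loop is parallelized at all, one option is to batch the groups so each worker receives a few large chunks. A sketch of that idea; the round-robin chunking and n_jobs=4 are my choices, not the original code's, and the worker function must be importable (i.e., defined at module level) for the worker processes to unpickle it:

from joblib import Parallel, delayed
import pandas as pd

def _run_chunk(chunk, func):
    # Process many groups inside one worker, concat once per chunk.
    return pd.concat([func(g) for g in chunk], axis=0)

def applyParallelChunked(dfGrouped, func, n_jobs=4):
    groups = [group for _, group in dfGrouped]
    chunks = [groups[i::n_jobs] for i in range(n_jobs)]  # round-robin split
    parts = Parallel(n_jobs=n_jobs)(delayed(_run_chunk)(c, func) for c in chunks)
    return pd.concat(parts, axis=0)

X = applyParallelChunked(tt.groupby(by=['uid']), getUserFeatures)

Even then, the vectorized groupby version sketched after section 1 is likely to beat any of these loops, since it removes the per-user Python overhead entirely.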