-
Notifications
You must be signed in to change notification settings - Fork 74
/
1_svm_data.py
117 lines (73 loc) · 3.1 KB
/
1_svm_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# coding: utf-8
import os
from time import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import feather
# events data
def paths(tokens):
    """Expand *tokens* into space-separated cumulative '_'-joined prefixes.

    e.g. ['US', 'CA', 'SF'] -> 'US US_CA US_CA_SF'.
    """
    prefixes = []
    for token in tokens:
        # each prefix extends the previous one by one more token
        prefixes.append(prefixes[-1] + '_' + token if prefixes else token)
    return ' '.join(prefixes)
def unwrap_geo(geo):
    """Turn a '>'-delimited geo string (e.g. 'US>CA>SF') into cumulative path tokens."""
    return paths(geo.split('>'))
df_events = pd.read_csv("../data/events.csv")
# Explicit assignment instead of fillna(..., inplace=1): passing an int as a
# boolean flag is fragile, and in-place fillna on a column attribute is the
# deprecated chained-inplace pattern in pandas.
df_events['geo_location'] = df_events.geo_location.fillna('')
geo_str = df_events.geo_location.apply(unwrap_geo)
# Timestamps in the raw file are millisecond offsets; the added constant
# appears to be the dataset's epoch offset in ms — TODO confirm against the
# competition data description. Dividing by 1000 converts ms -> seconds.
ts = (df_events.timestamp + 1465876799998) / 1000
df_events['timestamp'] = pd.to_datetime(ts, unit='s')
dt = df_events.timestamp.dt
# Categorical time-of-event tokens: day-of-week, hour, and their cross.
dow = 'dow_' + dt.dayofweek.astype('str')
hours = 'hour_' + dt.hour.astype('str')
dow_hour = 'dow_hour_' + dt.dayofweek.astype('str') + '_' + dt.hour.astype('str')
# One space-separated feature token string per display event:
# user id, document id, platform, time tokens, cumulative geo paths.
display_str = 'u_' + df_events.uuid + ' ' + \
              'd_' + df_events.document_id.astype('str') + ' ' + \
              'p_' + df_events.platform.astype('str') + ' ' + \
              dow + ' ' + hours + ' ' + dow_hour + ' ' + \
              geo_str
df_events_processed = pd.DataFrame()
df_events_processed['display_id'] = df_events.display_id
df_events_processed['display_str'] = display_str
# ad documents data: one feature token string per promoted ad.
df_promoted = pd.read_csv("../data/promoted_content.csv")
# Every '+' is explicit here: the original relied on implicit adjacent
# string-literal concatenation (`' ' 'campaign_'`) across continuation
# lines, which silently breaks if a line is reordered or a '+' is assumed.
ad_string = 'addoc_' + df_promoted.document_id.astype('str') + ' ' + \
            'campaign_' + df_promoted.campaign_id.astype('str') + ' ' + \
            'adv_' + df_promoted.advertiser_id.astype('str')
df_promoted_processed = pd.DataFrame()
df_promoted_processed['ad_id'] = df_promoted.ad_id
df_promoted_processed['promoted_ad_str'] = ad_string
# ad_id -> positional row index, used by prepare_batch for fast iloc lookup.
ad_to_idx = dict(zip(df_promoted_processed.ad_id, df_promoted_processed.index))
# processing data in batches
def prepare_batch(batch):
    """Attach the combined ad+display feature string to a clicks batch.

    Uses the module-level lookup tables built above (ad_to_idx,
    df_promoted_processed, df_events_processed). Returns the batch with a
    new 'ad_display_str' column.

    NOTE(review): indexing display_str with (display_id - 1) assumes
    display_id values are 1-based and aligned with events.csv row order —
    confirm against the data before reuse.
    """
    # reset_index(drop=True) instead of drop=1/inplace=1: booleans should be
    # booleans, and avoiding inplace mutation keeps the data flow explicit.
    batch = batch.reset_index(drop=True)
    promoted_idx = batch.ad_id.apply(ad_to_idx.get)
    promoted_ad_str = df_promoted_processed.promoted_ad_str.iloc[promoted_idx]
    display_str = df_events_processed.display_str.iloc[batch.display_id - 1]
    # Both series must share the batch's 0..n-1 index before elementwise '+'.
    promoted_ad_str = promoted_ad_str.reset_index(drop=True)
    display_str = display_str.reset_index(drop=True)
    batch['ad_display_str'] = promoted_ad_str + ' ' + display_str
    return batch
def append_to_csv(batch, csv_file):
    """Write *batch* to csv_file; append without a header when the file exists."""
    common = dict(encoding='utf-8', index=False)
    if os.path.exists(csv_file):
        # file already has a header row from the first write
        batch.to_csv(csv_file, mode='a', header=False, **common)
    else:
        batch.to_csv(csv_file, **common)
def delete_file_if_exists(filename):
    """Remove *filename* from disk; silently do nothing when it is absent."""
    if not os.path.exists(filename):
        return
    os.remove(filename)
def chunk_dataframe(df, n):
    """Yield consecutive row slices of *df*, each at most n rows long."""
    start = 0
    while start < len(df):
        yield df.iloc[start:start + n]
        start += n
# preparing data for train & test: stream each clicks file through
# prepare_batch in 1M-row chunks, appending results to the output CSV.
df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather')
delete_file_if_exists('tmp/svm_features_train.csv')
for train_chunk in tqdm(chunk_dataframe(df_all, n=1000000)):
    append_to_csv(prepare_batch(train_chunk), 'tmp/svm_features_train.csv')

df_test = feather.read_dataframe('tmp/clicks_test.feather')
delete_file_if_exists('tmp/svm_features_test.csv')
for test_chunk in tqdm(chunk_dataframe(df_test, n=1000000)):
    append_to_csv(prepare_batch(test_chunk), 'tmp/svm_features_test.csv')