# mask_data.py
import pandas as pd
from aitextgen.TokenDataset import TokenDataset
from aitextgen.tokenizers import train_tokenizer
from aitextgen.utils import GPT2ConfigCPU
from aitextgen import aitextgen
from faker import Faker
from random import shuffle, sample, randint, random
import ast
import re

class artificial_data(object):
    """Generate a masked, artificial copy of the user and activity data."""
    def __init__(self):
        # Train a data-to-text model to generate artificial descriptions of
        # the projects shown on the website
        file_name = 'dataset_des.txt'
        self.ai_text_generator(file_name)
        # Load the trained model
        self.ai_text = aitextgen(model_folder="trained_model",
                                 tokenizer_file="aitextgen.tokenizer.json")
        # Use the Faker package to generate fake user information
        self.fake = Faker()
        # Mask the user features
        all_user_features = pd.read_csv('user_feature.csv', index_col=0)
        all_user_features = self.user_feature_mask(all_user_features)
        all_user_features = self.clean_all_user_features(all_user_features)
        all_user_features.to_csv('fake_all_user_features.csv')
        # Mask the user activity features:
        # join the user activity datasets into one dataframe, match_df
        datasets_list = ['top_click_all.csv', 'visits_30_user.csv',
                        'course_enrolments.csv', 'order_info.csv']
        match_df = self.concat_match_df(datasets_list, all_user_features)
        match_df = self.match_df_mask(match_df)
        match_df.to_csv('match_df.csv')

    def ai_text_generator(self, file_name):
        """Train a data-to-text model that generates artificial project
        descriptions, and save the trained model to disk.

        Args:
            file_name: name of the input text dataset
        Returns:
            None
        """
        # Train a custom BPE tokenizer on the input text.
        # This saves one file, `aitextgen.tokenizer.json`, which contains the
        # information needed to rebuild the tokenizer.
        train_tokenizer(file_name)
        tokenizer_file = "aitextgen.tokenizer.json"
        # GPT2ConfigCPU is a mini variant of GPT-2 optimized for CPU training
        config = GPT2ConfigCPU()
        # Instantiate aitextgen using the created tokenizer and config
        ai = aitextgen(tokenizer_file=tokenizer_file, config=config)
        # Build the training data as a TokenDataset, which automatically
        # processes the text with the appropriate block size.
        data = TokenDataset(file_name, tokenizer_file=tokenizer_file, block_size=64)
        # Train the model. It saves pytorch_model.bin periodically and after
        # completion to the `trained_model` folder.
        # On a 2019 MacBook Pro, this took ~50 minutes to run.
        ai.train(data, batch_size=8, num_steps=50000, generate_every=5000, save_every=5000)
        return None
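
    # A minimal sketch of how the trained model could be sanity-checked,
    # assuming the files written above (not part of the original pipeline):
    # >>> ai = aitextgen(model_folder="trained_model",
    # ...                tokenizer_file="aitextgen.tokenizer.json")
    # >>> ai.generate(1, max_length=15, return_as_list=True)[0]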

    def user_feature_mask(self, all_user_features):
        """Mask the users' personal information with fake data, and resample
        several sparse columns so the masked dataset looks denser.

        Args:
            all_user_features: (pandas.DataFrame) user feature data
        Returns:
            (pandas.DataFrame) masked user feature data
        """
        # Fake the username part of the email address, keeping the real domain
        all_user_features['email'] = [self.fake.user_name() + '@' +
                                      all_user_features['email'].values[i].split('@')[1]
                                      for i in range(all_user_features.shape[0])]
        # Fake username, first_name, last_name
        all_user_features['username'] = [self.fake.user_name() for _ in range(all_user_features.shape[0])]
        all_user_features['first_name'] = [self.fake.first_name() for _ in range(all_user_features.shape[0])]
        all_user_features['last_name'] = [self.fake.last_name() for _ in range(all_user_features.shape[0])]
        # Shuffle and resample company, job_title, education, university, major
        all_user_features['company'] = self.shuffle_list(list(all_user_features['company']))
        all_user_features['job_title'] = sample(list(all_user_features['job_title']) +
                                                list(set(all_user_features['job_title'])) +
                                                [self.fake.job() for _ in range(40)],
                                                k=all_user_features.shape[0])
        all_user_features['education'] = sample(list(all_user_features['education']) +
                                                list(set(all_user_features['education'])) * 20,
                                                k=all_user_features.shape[0])
        all_user_features['university'] = sample(list(all_user_features['university']) +
                                                 list(set(all_user_features['university'])) * 5,
                                                 k=all_user_features.shape[0])
        all_user_features['major'] = sample(list(all_user_features['major']) +
                                            list(set(all_user_features['major'])) * 5,
                                            k=all_user_features.shape[0])
        # Shuffle columns 11-40 and 42-47 (positional)
        for i in list(range(11, 41)) + list(range(42, 48)):
            all_user_features.iloc[:, i] = self.shuffle_list(list(all_user_features.iloc[:, i]))
        # project_list: randomly fill 200 null rows with generated projects
        all_user_features[['project_list']] = all_user_features[['project_list']].applymap(self.literal_return)
        notnull_row_origin = list(all_user_features.loc[all_user_features['project_list'].notnull()].index)
        self.random_project_list(all_user_features, 200)
        # num_follower: assign a random count to the users whose project_list
        # was filled in on the previous step
        notnull_row = list(all_user_features.loc[all_user_features['project_list'].notnull()].index)
        append_row = list(set(notnull_row) - set(notnull_row_origin))
        all_user_features.loc[append_row, 'num_follower'] = [randint(1, 50) for _ in range(len(append_row))]
        # impact_score: assign a random score (range: 0-10) to the same users
        all_user_features.loc[append_row, 'impact_score'] = [random() * 10 for _ in range(len(append_row))]
        return all_user_features
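
    # A minimal sketch of the oversampling trick used above, on a toy column
    # (illustration only): appending copies of the unique values to the
    # population before sampling lifts the relative frequency of rare values.
    # >>> col = ['a', 'a', 'a', 'b']
    # >>> pool = col + list(set(col)) * 2   # adds 2 extra 'a' and 2 extra 'b'
    # >>> sample(pool, k=len(col))          # e.g. ['b', 'a', 'b', 'a']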

    def shuffle_list(self, df_col):
        # Shuffle the column values in place and return the shuffled list
        shuffle(df_col)
        return df_col

    def random_project_list(self, all_user_features, num):
        """Create augmented project lists for rows whose project_list is null.

        Args:
            all_user_features: (pandas.DataFrame) user feature data
            num: (int) number of rows to fill with a generated project list
        Returns:
            None
        """
        # Each list holds a random number of projects (range: 1-10).
        # The project id is a random number from 0 to 2000; the project title
        # and description are produced by the AI text generator (title with
        # max_length 15, description with max_length 150); the programming
        # language is chosen at random from a list of options.
        programming = ['Python', 'R', 'C++', 'C', 'Other', 'Java', None]
        null_row = list(all_user_features.loc[all_user_features['project_list'].isnull()].index)
        random_append_index = sample(null_row, k=num)
        for i in random_append_index:
            project_num = randint(1, 10)
            project_list = []
            for _ in range(project_num):
                info = (randint(0, 2000),
                        self.ai_text.generate(1, max_length=15, return_as_list=True)[0],
                        self.ai_text.generate(1, max_length=150, return_as_list=True)[0],
                        sample(programming, k=1)[0])
                project_list.append(info)
            # Use .at to store a list object in a single cell; assigning a
            # list through .loc is treated as broadcasting and raises
            all_user_features.at[i, 'project_list'] = project_list
        return None
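
    # A hypothetical example of one generated project tuple, for illustration
    # only (ids and text vary per run):
    #   (1042, '<generated title>', '<generated description>', 'Python')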

    def clean_all_user_features(self, all_user_features):
        # Strip punctuation from the text columns and turn them into lower case
        clean_list = ['company', 'job_title', 'education', 'major', 'university']
        for l in clean_list:
            all_user_features[l] = all_user_features[l].apply(
                lambda x: re.sub(r'[^\w\s]', '', str(x)).lower())
        return all_user_features

    def literal_return(self, val):
        # Parse a string into the Python literal it represents (e.g. a list);
        # return None when the string is not a valid literal
        try:
            return ast.literal_eval(val)
        except (ValueError, SyntaxError):
            return None
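
    # A minimal doctest-style sketch of the intended round trip, assuming the
    # CSV cell stores a Python-literal string:
    # >>> ast.literal_eval("[(3, 'title', 'desc', 'Python')]")
    # [(3, 'title', 'desc', 'Python')]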

    def concat_match_df(self, datasets_list, all_user_features):
        """Combine several dataframes of user behavior into one, and clean
        the combined dataset.

        Args:
            datasets_list: (list) filenames of the user behavior datasets
            all_user_features: (pandas.DataFrame) user feature data
        Returns:
            (pandas.DataFrame) dataframe with user behavior data
        """
        match_df = pd.DataFrame()
        for file in datasets_list:
            df = pd.read_csv(file, index_col=0)
            match_df = pd.concat([match_df, df], axis=1)
        # Keep only the uids that also appear in all_user_features
        match_df = match_df.loc[list(set(all_user_features.index) & set(match_df.index)), :]
        # Parse the list-like text columns
        clean_list = ['all_enrol', 'all_visits', 'all_orders']
        for l in clean_list:
            match_df[[l]] = match_df[[l]].applymap(self.str_to_list)
        return match_df

    def str_to_list(self, string):
        # Parse a "[a, b, c]"-style string into a list of strings;
        # return None for missing values
        if str(string) not in ['NaN', 'nan', 'None']:
            return string.strip('][').split(', ')
        else:
            return None
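
    # Sketch of the parsing behavior; note the elements stay strings and are
    # not cast back to their original types:
    # >>> "[1, 2, 3]".strip('][').split(', ')
    # ['1', '2', '3']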

    def match_df_mask(self, match_df):
        """Resample the user behavior data to fill in sparse columns.

        Args:
            match_df: (pandas.DataFrame) dataframe with user behavior data
        Returns:
            (pandas.DataFrame) dataframe with augmented user behavior data
        """
        # Expand the purchase columns
        order_num = [randint(200, 300) for _ in range(50)]
        shuffle_order = list(set(sample(list(match_df[match_df.all_orders.isnull()].index), k=170)))
        match_df.loc[shuffle_order, 'all_orders'] = sample(order_num * 5, k=len(shuffle_order))
        match_df.loc[shuffle_order, 'purchased_entity'] = match_df.loc[shuffle_order, 'all_orders']
        # Expand the visit columns
        shuffle_visit = list(set(sample(list(match_df[match_df.all_visits.isnull()].index), k=280)))
        visit_info = sample(list(match_df[match_df.all_visits.notnull()].index) * 7, k=len(shuffle_visit))
        for col in ['url', 'time', 'time_period', 'all_visits']:
            match_df.loc[shuffle_visit, col] = list(match_df.loc[visit_info, col])
        # Expand the enrollment columns
        shuffle_enrol = list(set(sample(list(match_df[match_df.all_enrol.isnull()].index), k=160)))
        enrol_info = list(set(match_df[match_df.all_enrol.notnull()]['enrolment']))
        match_df.loc[shuffle_enrol, 'enrolment'] = sample(enrol_info * 7, k=len(shuffle_enrol))
        match_df.loc[shuffle_enrol, 'all_enrol'] = match_df.loc[shuffle_enrol, 'enrolment']
        return match_df

if __name__ == "__main__":
    fake_data = artificial_data()
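
# Running this module end to end trains the text model and writes the masked
# outputs: the trained_model/ folder and aitextgen.tokenizer.json produced by
# aitextgen, plus fake_all_user_features.csv and match_df.csv.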