-
Notifications
You must be signed in to change notification settings - Fork 28
/
prep_data.py
29 lines (20 loc) · 1.05 KB
/
prep_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import pandas as pd
import glob
## load reddit comment data
# https://stackoverflow.com/questions/20906474/import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe
all_files = glob.glob("raw_data/*.csv")
li = []
for filename in all_files:
dff = pd.read_csv(filename, index_col=None, header=0)
li.append(dff)
df = pd.concat(li, axis=0, ignore_index=True)
## put data into comment [SEP] reply format for gpt fine-tuning
(df['comment']+' [SEP] '+df['reply'] ).to_csv('gpt2_finetune.csv', index=False, header=True)
## store comments to be seeds for creating a fake reply dataset for BERT classifier
df['comment'].drop_duplicates().to_csv('gpt2_generate.csv', index=False, header=True)
## generate the datasets for BERT classifiers
# all of these are marked as real=1, the fakes from GPT2 will be real=0
df['real']=1
# bin the upvote scores into 4 bins
df['rating'] = df['reply_score'].apply(lambda x: '0' if x<1 else '1' if x==1 else '2' if x<5 else '3')
df[['comment','reply','real','rating']].to_csv('bert_gan_real.csv', index=False, header=True)