-
Notifications
You must be signed in to change notification settings - Fork 1
/
make_dataset.py
82 lines (72 loc) · 2.75 KB
/
make_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import pandas as pd
import os
from basic_tokenizers import ByteTextTokenizer
import time
from tqdm import tqdm
def read_lines(path):
    """
    Read a text file and return its lines.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    :param path: path to the text file to read
    :return: a list with one string per line, each keeping its line terminator
    :raises AssertionError: if path does not exist (when assertions are enabled)
    """
    # Kept as an assert so callers observe the same exception type as before.
    assert os.path.exists(path)
    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()
def convert_data_to_seq_length(df, seq_length=64000):
    """
    Regroup consecutive rows of a text dataframe into larger samples whose
    summed length stays below seq_length.

    Rows are concatenated in order. A sample is closed as soon as adding the
    next row would bring its total length to seq_length or more. The final,
    partially filled sample is kept, so no input text is dropped. A single row
    whose own length is already >= seq_length cannot be split here and is
    emitted as a sample of its own.

    :param df: a pandas dataframe with columns [text, lens]
    :param seq_length: the numericalized token sequence length to split the data into
    :return: a new dataframe with columns [text, lens] holding the regrouped samples
    """
    samples = []
    # Collect the pieces of the current sample and join once when it is
    # closed; this avoids repeated string concatenation.
    chunk_parts = []
    chunk_len = 0
    rows = zip(df['text'], df['lens'])
    for text, length in tqdm(rows, desc="splitting data", total=len(df)):
        # Only close a sample that actually holds something, so an oversized
        # first row does not produce an empty ('', 0) sample.
        if chunk_parts and chunk_len + length >= seq_length:
            samples.append({'text': ''.join(chunk_parts), 'lens': chunk_len})
            chunk_parts = []
            chunk_len = 0
        chunk_parts.append(text)
        chunk_len += length
    # Flush whatever is left after the last row; otherwise the tail is lost.
    if chunk_parts:
        samples.append({'text': ''.join(chunk_parts), 'lens': chunk_len})
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(samples, columns=['text', 'lens'])
def read_and_prepare_data(data_path, seq_length=0):
    """
    Load the raw text file into a dataframe and attach length columns.

    Splitting into train and validation sets is not done here.

    :param data_path: relative path to the raw data
    :param seq_length: sequence length to regroup the samples into; 0 (the
        default) keeps every line of the file as its own sample
    :return: dataframe with columns text, lens and lens_cum_sum
    """
    print("Reading data from path...")
    raw_lines = read_lines(data_path)
    df = pd.DataFrame({'text': raw_lines})
    print("Done!")

    if seq_length == 0:
        # Default: samples are left as they are and lens is the character
        # count of each line.
        df['lens'] = df['text'].map(len)
    else:
        notice = ("Sequence length has been added, "
                  "splitting data to samples with sequence length ")
        print(notice + str(seq_length))
        # Short pause so the progress bar does not interleave with the print
        time.sleep(0.5)

        tokenizer = ByteTextTokenizer(is_lm=True, add_bos=True, add_eos=True)
        tqdm.pandas(desc="preparing data for split")

        # lens is the length of the tokenizer output for each line
        df['lens'] = df['text'].progress_map(lambda line: len(tokenizer(line)))

        # Regroup the lines into samples according to the sequence length
        df = convert_data_to_seq_length(df, seq_length)
        print()
        print("Done!")

    df['lens_cum_sum'] = df['lens'].cumsum()
    return df