-
Notifications
You must be signed in to change notification settings - Fork 0
/
preproc2.py
executable file
·85 lines (70 loc) · 3.37 KB
/
preproc2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# this program will walk the folder specified in the "path" variable and count the number of files containing each n-gram at least "WFO" times
# this program outputs the n-grams found in the files, and their frequency, to multiple output files, one for each batch.
# Each of these files will contain the batch n-gram frequencies sorted by n-gram. They can be recombined using merge sort
#__________________IMPORTS_________________________________________________________
import operator
import os
import re
from collections import Counter, defaultdict
#__________________GLOBALS_________________________________________________________
N = 6  # number of byte tokens per n-gram
WFO = 1  # minimum number of within-file occurrences for an n-gram to be counted
FREQ = defaultdict(int)  # n-gram (str) -> number of files it appears in at least WFO times
path = 'train/'  # relative or absolute path of the folder to recursively walk for input files
batchSize = 100  # number of input files accumulated per output batch CSV
#__________________FUNCTIONS_________________________________________________________
def nGramFreq(fname, n):
    """Count byte n-grams in one ``.bytes`` file and update the global FREQ.

    Reads the whole file, strips the 8-hex-digit address prefix from each
    line, maps unknown bytes ("?") to "0", and tokenizes the remaining
    two-character hex byte tokens.  Every contiguous run of ``n`` tokens is
    one n-gram; for each n-gram occurring at least ``WFO`` times within this
    file, the global document-frequency counter ``FREQ`` is incremented by 1.

    Args:
        fname: path of the file to process.
        n: number of byte tokens per gram.

    Returns:
        None.  Results accumulate in the module-level ``FREQ`` dict.
    """
    # Context manager guarantees the file is closed even if read() raises.
    with open(fname, "r") as fp:
        data = fp.read()
    data = re.sub(r'([0-9]|[A-F]){8}', "", data)  # drop 8-hex-char line-address prefixes
    data = data.replace("?", "0")                 # unknown nibbles become "0"
    # split() collapses newlines and any run of whitespace in a single pass,
    # replacing the original newline/double-space substitutions.
    tokens = data.split()
    # Zip n staggered copies of the token list -> sliding window of width n.
    ngrams = zip(*[tokens[i:] for i in range(n)])
    freq = Counter("".join(gram) for gram in ngrams)  # within-file frequencies
    for gram, count in freq.items():
        if count >= WFO:
            # Document-frequency update; .get() works for plain dicts too.
            FREQ[gram] = FREQ.get(gram, 0) + 1
#__________________MAIN_________________________________________________________
print("\n\n\n")


def _save_batch(freq, out_path):
    """Write freq (n-gram -> file count) to out_path as CSV lines,
    sorted by the numeric value of the n-gram so the per-batch files
    can later be recombined with a merge sort."""
    with open(out_path, "w+") as fp:  # with-block guarantees close/flush
        for key, value in sorted(freq.items(), key=lambda item: int(item[0], 16)):
            fp.write(key + "," + str(value) + "\n")


# Recursively collect every .bytes file under `path`.
files = []
for r, d, f in os.walk(path):
    for file in f:
        if '.bytes' in file:
            files.append(os.path.join(r, file))
print(len(files))

# Ensure the output directory exists (the original crashed on a fresh tree).
out_dir = "train" + str(N) + "grams"
os.makedirs(out_dir, exist_ok=True)

# Obtain n-grams, flushing FREQ to disk every batchSize files.
FREQ = defaultdict(int)
p = 0  # number of files processed so far
total = max(len(files), 1)  # guard: avoid ZeroDivisionError when no files found
for f in files:
    print(str(N) + "-gram progress:" + str(100 * (p / total))[:6] + "%", end="\r", flush=True)
    p += 1
    nGramFreq(f, N)
    if (p % batchSize) == 0:  # save preproc info for this completed batch
        _save_batch(FREQ, os.path.join(out_dir, str(int(p / batchSize)) + "-" + str(N) + "Grams.csv"))
        FREQ = defaultdict(int)
# Save preproc info for the final (possibly partial) batch.
_save_batch(FREQ, os.path.join(out_dir, "last-" + str(N) + "Grams.csv"))
FREQ = defaultdict(int)
print(str(N) + "-gram progress:" + str(100 * (p / total))[:6] + "%", flush=True)
print("\nDONE\n\n")