-
Notifications
You must be signed in to change notification settings - Fork 0
/
preproc2.py
executable file
·85 lines (70 loc) · 3.37 KB
/
preproc2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# this program will walk the folder specified in the "path" variable and count the number of files containing each n-gram at least "WFO" times
# this program outputs the n-grams found in the files, and their frequency, to multiple output files, one for each batch.
# Each of these files will contain the batch n-gram frequencies sorted by n-gram. They can be recombined using merge sort
#__________________IMPORTS_________________________________________________________
import operator
import os
import re
from collections import Counter, defaultdict
#__________________GLOBALS_________________________________________________________
N = 6  # number of byte tokens per n-gram
WFO = 1  # minimum number of within-file occurrences for an n-gram to be counted
FREQ = defaultdict(int)  # n-gram (str) -> number of files it appears in at least WFO times
path = 'train/'  # relative or absolute path of the folder to recursively walk for input files
batchSize = 100  # number of input files accumulated per output batch CSV
#__________________FUNCTIONS_________________________________________________________
def nGramFreq(fname, n):
    """Count byte n-grams in one ``.bytes`` file and update the global FREQ.

    Reads the whole file, strips the 8-hex-digit address prefix from each
    line, maps unknown bytes ("?") to "0", and tokenizes the remaining
    two-character hex byte tokens.  Every contiguous run of ``n`` tokens is
    one n-gram; for each n-gram occurring at least ``WFO`` times within this
    file, the global document-frequency counter ``FREQ`` is incremented by 1.

    Args:
        fname: path of the file to process.
        n: number of byte tokens per gram.

    Returns:
        None.  Results accumulate in the module-level ``FREQ`` dict.
    """
    # Context manager guarantees the file is closed even if read() raises.
    with open(fname, "r") as fp:
        data = fp.read()
    data = re.sub(r'([0-9]|[A-F]){8}', "", data)  # drop 8-hex-char line-address prefixes
    data = data.replace("?", "0")                 # unknown nibbles become "0"
    # split() collapses newlines and any run of whitespace in a single pass,
    # replacing the original newline/double-space substitutions.
    tokens = data.split()
    # Zip n staggered copies of the token list -> sliding window of width n.
    ngrams = zip(*[tokens[i:] for i in range(n)])
    freq = Counter("".join(gram) for gram in ngrams)  # within-file frequencies
    for gram, count in freq.items():
        if count >= WFO:
            # Document-frequency update; .get() works for plain dicts too.
            FREQ[gram] = FREQ.get(gram, 0) + 1
#__________________MAIN_________________________________________________________
print("\n\n\n")


def _save_batch(freq, out_path):
    """Write freq (n-gram -> file count) to out_path as CSV lines,
    sorted by the numeric value of the n-gram so the per-batch files
    can later be recombined with a merge sort."""
    with open(out_path, "w+") as fp:  # with-block guarantees close/flush
        for key, value in sorted(freq.items(), key=lambda item: int(item[0], 16)):
            fp.write(key + "," + str(value) + "\n")


# Recursively collect every .bytes file under `path`.
files = []
for r, d, f in os.walk(path):
    for file in f:
        if '.bytes' in file:
            files.append(os.path.join(r, file))
print(len(files))

# Ensure the output directory exists (the original crashed on a fresh tree).
out_dir = "train" + str(N) + "grams"
os.makedirs(out_dir, exist_ok=True)

# Obtain n-grams, flushing FREQ to disk every batchSize files.
FREQ = defaultdict(int)
p = 0  # number of files processed so far
total = max(len(files), 1)  # guard: avoid ZeroDivisionError when no files found
for f in files:
    print(str(N) + "-gram progress:" + str(100 * (p / total))[:6] + "%", end="\r", flush=True)
    p += 1
    nGramFreq(f, N)
    if (p % batchSize) == 0:  # save preproc info for this completed batch
        _save_batch(FREQ, os.path.join(out_dir, str(int(p / batchSize)) + "-" + str(N) + "Grams.csv"))
        FREQ = defaultdict(int)
# Save preproc info for the final (possibly partial) batch.
_save_batch(FREQ, os.path.join(out_dir, "last-" + str(N) + "Grams.csv"))
FREQ = defaultdict(int)
print(str(N) + "-gram progress:" + str(100 * (p / total))[:6] + "%", flush=True)
print("\nDONE\n\n")