-
Notifications
You must be signed in to change notification settings - Fork 0
/
featEx.py
executable file
·66 lines (54 loc) · 2.62 KB
/
featEx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# This program generates a feature vector as a .csv file (arg 3) from a .byte file (arg 1)
# and a .csv file (arg 2) that contains the sorted list of n-grams to search for in the byte file.
#__________________IMPORTS_________________________________________________________
import re
import os
import io
import sys
import shutil
import operator
import numpy as np
import pandas as pd
#__________________FUNCTIONS_________________________________________________________
def compare(ngram, elem):
    """Three-way comparison: -1 if ngram < elem, 1 if ngram > elem, 0 if equal."""
    if ngram == elem:
        return 0
    return -1 if ngram < elem else 1
def binarySearch(list, ngram):  # If ngram in list, returns index, else returns -1
    """Binary search for a hex-string ngram in a sorted list of hex strings.

    Elements are compared by their integer value (base 16), matching the sort
    order of the frequent-ngram file. Returns the index of ngram in list, or
    -1 if it is absent.

    NOTE: the parameter name `list` shadows the builtin; it is kept for
    backward compatibility with existing callers.
    """
    target = int(ngram, 16)  # hoisted out of the loop: invariant across iterations
    lo, hi = 0, len(list)    # search window is [lo, hi)
    while lo < hi:
        mid = (lo + hi) // 2
        cur = int(list[mid], 16)
        if target < cur:
            hi = mid             # search ngram is in the left half
        elif target > cur:
            lo = mid + 1         # search ngram is in the right half
        else:
            return mid           # found
    return -1
#__________________MAIN_________________________________________________________
N = 6  # ngram size, in bytes (each byte is a 2-char hex token, so an ngram is 2*N chars)

# Read the .byte file to scan (arg 1) as one large string.
with open(sys.argv[1], "r") as infp:
    data = infp.read()

# Sorted list of frequent ngrams to use as features (arg 2, single-column csv).
grams = pd.read_csv(sys.argv[2], header=None)[0].tolist()
vec = np.zeros(len(grams), dtype=int)  # binary feature vector, one slot per frequent ngram

data = re.sub(r'[0-9A-F]{8}', "", data)  # strip the 8-hex-digit line address prefixes
data = re.sub(r'\?', "0", data)          # unknown bytes ('?') are treated as 0
# split() with no argument splits on any whitespace run (spaces and newlines)
# and drops empty tokens, so each token is one byte in the range 00..FF.
tokens = data.split()
# Sliding window: zip N copies of the token list, the i-th offset by i,
# so each tuple is N consecutive bytes.
ngrams = zip(*[tokens[i:] for i in range(N)])
words = ["".join(ngram) for ngram in ngrams]  # join each tuple into one ngram string

for w in words:                       # for each ngram in the input file...
    i = binarySearch(grams, w)        # look it up in the frequent-ngram list
    if i != -1:
        vec[i] = 1                    # mark the corresponding feature as present

# Save the feature vector (arg 3), one 0/1 value per line.
with open(sys.argv[3], "w") as outfp:
    outfp.write("\n".join(vec.astype(str).tolist()))