-
Notifications
You must be signed in to change notification settings - Fork 0
/
selbyfreq.py
executable file
·50 lines (46 loc) · 1.83 KB
/
selbyfreq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#This program takes the local path to a two-column .csv file as command-line argument one; the first column holds ngrams
#and the second column holds the number of files each ngram appears in. It writes a .csv file, at the path given by argument two,
#that contains only the ngrams appearing in at least 100 files, along with their corresponding frequencies.
#__________________IMPORTS_________________________________________________________
import re
import os
import io
import sys
import shutil
import operator
from collections import defaultdict
#__________________MAIN_________________________________________________________
# Progress is reported each time the current ngram's hex value passes another
# multiple of this step.
PROGRESS_STEP = 0x100000000
# Value used as the 100% mark when computing the progress percentage.
NGRAM_SPACE = 0xFFFFFFFFFFFF
# Smallest file count an ngram needs in order to be kept in the output.
MIN_FILE_COUNT = 100

# Fields are separated by commas; the trailing newline is treated as a
# separator too so it never ends up inside the frequency field.
_FIELD_SEP = re.compile(r",|\n")


def parse_row(raw):
    """Parse one input line into ``(ngram, freq, position)``.

    ``ngram`` is the first field as text, ``freq`` is the second field as an
    int, and ``position`` is the ngram interpreted as a hexadecimal number
    (used only for progress reporting).

    Raises:
        IndexError: the line has fewer than two fields (e.g. a blank line).
        ValueError: the frequency is not an integer or the ngram is not hex.
    """
    fields = _FIELD_SEP.split(raw)[:2]
    ngram = fields[0]
    return ngram, int(fields[1]), int(ngram, 16)


def filter_ngrams(infp, outfp, min_count=MIN_FILE_COUNT):
    """Copy the rows of ``infp`` whose frequency is at least ``min_count``.

    Args:
        infp: iterable of text lines of the form ``ngram,frequency``.
        outfp: writable text stream; kept rows are written as
            ``ngram,frequency`` followed by a newline.
        min_count: minimum frequency a row needs to be kept.

    Returns:
        The number of rows written to ``outfp``.

    Processing stops at the first malformed row (after printing an error
    message). Rows kept before that point, and all rows up to a normal end of
    input, are always present in ``outfp``; nothing is held back in a
    separate buffer that could be lost on exit.
    """
    kept = 0
    step = 1  # index of the next PROGRESS_STEP multiple to report
    for raw in infp:
        try:
            ngram, freq, position = parse_row(raw)
        except (IndexError, ValueError):
            print("ERROR: get next line failed")
            break
        if freq >= min_count:
            outfp.write(ngram + "," + str(freq) + "\n")
            kept += 1
        # At most one progress line per row, even if the ngram value jumped
        # across several multiples of PROGRESS_STEP at once.
        if position > PROGRESS_STEP * step:
            percent = 100 * ((PROGRESS_STEP * step) / NGRAM_SPACE)
            print("Progress:" + str(percent)[:6] + "%", end="\n", flush=True)
            step += 1
    return kept


def main(argv=None):
    """Filter the CSV named by ``argv[1]`` into the CSV named by ``argv[2]``.

    ``argv`` defaults to ``sys.argv``. Exits with a usage message when either
    path is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        sys.exit("usage: selbyfreq.py INPUT_CSV OUTPUT_CSV")
    print("\n\n")
    # The context managers close both files even if filtering raises.
    with open(argv[1], "r") as infp, open(argv[2], "w") as outfp:
        filter_ngrams(infp, outfp)
    print("\n")
    print("\nProgress:100%", flush=True)


if __name__ == "__main__":
    main()