added post_process and custom_engine function inputs
Former-commit-id: b9c505f
interrogator committed Jun 26, 2015
1 parent b535d15 commit 46bad54
1 changed file: corpkit/interrogator.py (33 additions, 11 deletions)
@@ -15,6 +15,8 @@ def interrogator(path,
quicksave = False,
add_to = False,
add_pos_to_g_d_option = False,
custom_engine = False,
post_process = False,
**kwargs):

"""
@@ -85,6 +87,10 @@ def interrogator(path,
whose function matches the regex will be kept, and the tag will not be printed
quicksave : str
save result after interrogation has finished, using str as file name
custom_engine : function
pass a function to process every xml file and return a list of results
post_process : function
pass a function that processes every item in the list of results
** kwargs : just_content_words for dictionary building
more generally, kwargs allows users to pass in earlier interrogation
settings in order to reperform the search.
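
For illustration, here is a minimal sketch of a post_process callable and how it might be passed in. The helper name, and the positional arguments in the commented-out call, are assumptions for the example rather than part of corpkit:

    # illustrative only: lemmatise_match is not part of corpkit
    from nltk.stem.wordnet import WordNetLemmatizer

    lmtzr = WordNetLemmatizer()

    def lemmatise_match(match):
        """post_process example: lowercase and lemmatise each result string."""
        return lmtzr.lemmatize(match.lower())

    # assuming the usual positional arguments (corpus path, option, query):
    # results = interrogator(corpus_path, option, query, post_process = lemmatise_match)

As the change to processwords below shows, supplying post_process replaces the default lowercasing and punctuation filtering, so the callable should handle any normalisation it needs itself.
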
@@ -151,6 +157,10 @@ def interrogator(path,
if lemmatise:
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr=WordNetLemmatizer()

# put me in a more appropriate spot
if custom_engine is not False:
option = 'z'

# check if we are in ipython
try:
@@ -205,16 +215,14 @@ def gettag(query, lemmatag = False):
def processwords(list_of_matches):
"""edit matches from interrogations"""

# everything should already be in unicode by this point.
#if not dependency:
#list_of_matches = [unicode(w, 'utf-8', errors = 'ignore') for w in list_of_matches]

# remove commas for pandas csv tokeniser, which i should probably remove soon.
list_of_matches = [w.lower().replace(',', '') for w in list_of_matches]

# remove punct etc.
if translated_option != 'o' and translated_option != 'u':
list_of_matches = [w for w in list_of_matches if re.search(regex_nonword_filter, w)]
if post_process is not False:
list_of_matches = [post_process(m) for m in list_of_matches]
else:
# lowercase everything (the comma-stripping for the pandas csv tokeniser has been dropped here)
list_of_matches = [w.lower() for w in list_of_matches]
# remove punct etc.
if translated_option != 'o' and translated_option != 'u':
list_of_matches = [w for w in list_of_matches if re.search(regex_nonword_filter, w)]

list_of_matches.sort()

@@ -550,6 +558,14 @@ def depnummer(xmldata):
return result

def tabler(subcorpus_names, list_of_dicts, num_rows):
import pandas as pd
cols = []
for subcorp, data in zip(subcorpus_names, list_of_dicts):
col = pd.Series([w for w, v in data.most_common(num_rows)], name = subcorp)
cols.append(col)
word_table = pd.concat(cols, axis = 1)
return word_table

csvdata = [','.join(subcorpus_names)]
# for number of rows of data in table
for i in range(num_rows):
@@ -564,6 +580,7 @@ def tabler(subcorpus_names, list_of_dicts, num_rows):
csvdata.append(','.join(line))
csv = '\n'.join(csvdata)
word_table = read_csv(StringIO(csv))
word_table = DataFrame(data, index = subcorpus_names)
return word_table

# a few things are off by default:
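
The rewritten tabler builds one pandas column per subcorpus, holding the num_rows most frequent items from that subcorpus's results dictionary. A quick sketch of the expected behaviour, using invented subcorpus names and counts purely for illustration:

    from collections import Counter
    import pandas as pd

    subcorpus_names = ['1987', '1995']
    list_of_dicts = [Counter({'risk': 10, 'danger': 4}),
                     Counter({'risk': 7, 'threat': 5})]

    # one column per subcorpus, containing its two most frequent items
    cols = [pd.Series([w for w, v in d.most_common(2)], name = name)
            for name, d in zip(subcorpus_names, list_of_dicts)]
    word_table = pd.concat(cols, axis = 1)
    print word_table
    #      1987    1995
    # 0    risk    risk
    # 1  danger  threat

Because the new block returns early, the csv-based table building that remains below it is never reached.
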
@@ -654,6 +671,10 @@ def tabler(subcorpus_names, list_of_dicts, num_rows):
query = r'(?i)\b(' + '|'.join(query) + r')\b'

# dependency option:
elif option.startswith('z') or option.startswith('Z'):
translated_option = 'z'
dependency = True
optiontext = 'Using custom dependency query engine.'
elif option.startswith('i') or option.startswith('I'):
translated_option = 'i'
depnum = True
@@ -826,7 +847,6 @@ def tabler(subcorpus_names, list_of_dicts, num_rows):
elif 'b' in selection:
print ''
return


allowed_dep_types = ['basic-dependencies', 'collapsed-dependencies', 'collapsed-ccprocessed-dependencies']
while dep_type not in allowed_dep_types:
@@ -1057,6 +1077,8 @@ def tabler(subcorpus_names, list_of_dicts, num_rows):
filepath = os.path.join(path, subcorpus_name, f)
with open(filepath, "rb") as text:
data = text.read()
if translated_option == 'z':
result_from_file = custom_engine(data)
if translated_option == 'g':
result_from_file = govrole(data)
if translated_option == 'd':
