diff --git a/corenlp_pywrap/example.py b/corenlp_pywrap/example.py index b223086..ac22d74 100644 --- a/corenlp_pywrap/example.py +++ b/corenlp_pywrap/example.py @@ -1,5 +1,6 @@ import pywrap as p -cn = p.CoreNLP() -r = cn.arrange('This is Sherin, He is good but I am bad') -for key, val in r.items(): - print(key, val) \ No newline at end of file +cn = p.CoreNLP(annotator_list = ['lemma']) +sent = '''Well, that is it then. Zimbabwe have thrashed Afghanistan. For a long while, it looked like Afghanistan held all the aces, but they have hurtled towards a heavy defeat. Coming back to the wicket, Hamza tried to nudge this back of a length delivery to fine leg. However, he got a faint edge on it. Mutumbami showed superb reflexes to dive to his left and snaffle the catch. Players from both sides shake hands as they make their way back to the pavilion. Amir Hamza c Mutumbami b Luke Jongwe 1(12)Luke Jongwe to Amir Hamza, THAT'S OUT!! Caught!!''' +r = cn.arrange(sent) +print(len(r['index'])) +print(len(r['word'])) \ No newline at end of file diff --git a/corenlp_pywrap/pywrap.py b/corenlp_pywrap/pywrap.py index c2c077c..eeda435 100644 --- a/corenlp_pywrap/pywrap.py +++ b/corenlp_pywrap/pywrap.py @@ -17,7 +17,6 @@ class CoreNLP: "relation", "natlog", "quote"] url = 'http://127.0.0.1:9000' out_format = 'json' - sentences = [] def __init__(self, url=url, annotator_list=annotator_full_list): assert url.upper().startswith('HTTP'), \ @@ -102,11 +101,10 @@ def regex(cls, endpoint, data, pattern, custom_filter): return cls.server_connection(current_url, data) @staticmethod - def process_sentences(sentence): - assert isinstance(sentence, list), 'it should be a list' - assert len(sentence) == 1, 'assuming the lenght is one' - sent_dict = sentence[0] - tokens = sent_dict['tokens'] + def process_sentences(sentences): + assert isinstance(sentences, list), 'it should be a list' + index = 0 + new_index = 0 token_dict = { 'index':[], 'truecaseText':[], @@ -122,9 +120,16 @@ def process_sentences(sentence): 'word':[], 'after':[] } - for val in tokens: - for key, val in val.items(): - token_dict[key].append(val) + for sentence in sentences: + index = new_index + tokens = sentence['tokens'] + for val in tokens: + for key, val in val.items(): + if key == 'index': + new_index = index + int(val) + token_dict[key].append(str(new_index)) + else: + token_dict[key].append(val) return token_dict