Skip to content

Commit

Permalink
Modify arrange() to handle multiple sentences
Browse files Browse the repository at this point in the history
  • Loading branch information
boat-builder committed Aug 12, 2016
1 parent 61e9438 commit e38526d
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 13 deletions.
9 changes: 5 additions & 4 deletions corenlp_pywrap/example.py
@@ -1,5 +1,6 @@
# Example usage of corenlp_pywrap: send text to a running CoreNLP server
# and inspect the token-level result returned by arrange().
import pywrap as p

# Full annotator pipeline on a short input with two clauses.
client = p.CoreNLP()
result = client.arrange('This is Sherin, He is good but I am bad')
for field, values in result.items():
    print(field, values)

# Restrict the pipeline to the 'lemma' annotator and feed a multi-sentence
# text to exercise arrange() across sentence boundaries.
client = p.CoreNLP(annotator_list=['lemma'])
text = '''Well, that is it then. Zimbabwe have thrashed Afghanistan. For a long while, it looked like Afghanistan held all the aces, but they have hurtled towards a heavy defeat. Coming back to the wicket, Hamza tried to nudge this back of a length delivery to fine leg. However, he got a faint edge on it. Mutumbami showed superb reflexes to dive to his left and snaffle the catch. Players from both sides shake hands as they make their way back to the pavilion. Amir Hamza c Mutumbami b Luke Jongwe 1(12)Luke Jongwe to Amir Hamza, THAT'S OUT!! Caught!!'''
result = client.arrange(text)

# The per-token lists should stay aligned: one entry per token in each field.
print(len(result['index']))
print(len(result['word']))
23 changes: 14 additions & 9 deletions corenlp_pywrap/pywrap.py
Expand Up @@ -17,7 +17,6 @@ class CoreNLP:
"relation", "natlog", "quote"]
url = 'http://127.0.0.1:9000'
out_format = 'json'
sentences = []

def __init__(self, url=url, annotator_list=annotator_full_list):
assert url.upper().startswith('HTTP'), \
Expand Down Expand Up @@ -102,11 +101,10 @@ def regex(cls, endpoint, data, pattern, custom_filter):
return cls.server_connection(current_url, data)

@staticmethod
def process_sentences(sentence):
assert isinstance(sentence, list), 'it should be a list'
assert len(sentence) == 1, 'assuming the lenght is one'
sent_dict = sentence[0]
tokens = sent_dict['tokens']
def process_sentences(sentences):
assert isinstance(sentences, list), 'it should be a list'
index = 0
new_index = 0
token_dict = {
'index':[],
'truecaseText':[],
Expand All @@ -122,9 +120,16 @@ def process_sentences(sentence):
'word':[],
'after':[]
}
for val in tokens:
for key, val in val.items():
token_dict[key].append(val)
for sentence in sentences:
index = new_index
tokens = sentence['tokens']
for val in tokens:
for key, val in val.items():
if key == 'index':
new_index = index + int(val)
token_dict[key].append(str(new_index))
else:
token_dict[key].append(val)
return token_dict


Expand Down

0 comments on commit e38526d

Please sign in to comment.