Add lock around label-CSV writes in the multiprocessing pool; save_document now returns the row instead of writing it

ghomasHudson · ghomasHudson · commit c171915824db · 2021-06-07T14:36:42.000+01:00
diff --git a/scrape.py b/scrape.py
@@ -9,10 +9,11 @@
 import sys
 import re
 import csv
-from multiprocessing import Pool
+import glob
+from multiprocessing import Pool, Lock
 
 
-def save_document(doc_id, output_dir, csv_writer):
+def save_document(doc_id, output_dir):
     '''Gather document details'''
 
     # Get document details
@@ -53,7 +54,7 @@ def save_document(doc_id, output_dir, csv_writer):
     # Save content
     open(os.path.join(output_dir, str(doc_id) + ".txt"), 'w').write(doc["content"])
     del doc["content"]
-    csv_writer.writerow(doc)
+    return doc
 
 
 def drawLoadingBar(val, maximum):
@@ -106,7 +107,10 @@ def parse_args(args):
 
     return parser.parse_args(args)
 
+
+
 def main(args):
+    global _process_line
     # Make data dir
     try:
         os.mkdir(args.output_dir)
@@ -141,26 +145,38 @@ def main(args):
                             break
                         drawLoadingBar(currentCount, args.max_per_lang)
                         currentCount += 1
+                        writer.writerow(save_document(d["id"], args.output_dir))
 
-                        save_document(d["id"], args.output_dir, writer)
     elif args.command == "recreate":
-        writers = {}
         lines = list(args.id_file.readlines()[1:])
+
+        for split in ["train", "test", "dev"]:
+            with open(os.path.join(args.output_dir, "labels." + split + ".csv"), 'a') as f:
+                writer = csv.DictWriter(f, fieldnames=["document_id", "author_id", "L1", "english_proficiency"])
+                writer.writeheader()
+
         indexes = []
         def _process_line(tup):
             index, line = tup
             doc_id, split = line.strip().split(",")
-            if split not in writers.keys():
-                f = open(os.path.join(args.output_dir, "labels." + split + ".csv"), 'w')
-                writers[split] = csv.DictWriter(f, fieldnames=["document_id", "author_id", "L1", "english_proficiency"])
-                writers[split].writeheader()
-
-            save_document(doc_id, args.output_dir, writers[split])
-            indexes.append(index)
-            drawLoadingBar(len(indexes), len(lines))
-
-        with Pool(processes=args.agents) as pool:
-            result = pool.map(_process_line, enumerate(lines), 1)
+            doc = save_document(doc_id, args.output_dir)
+            if doc is not None:
+                l.acquire()
+                with open(os.path.join(args.output_dir, "labels." + split + ".csv"), 'a') as f:
+                    writer = csv.DictWriter(f, fieldnames=["document_id", "author_id", "L1", "english_proficiency"])
+                    writer.writerow(doc)
+                indexes.append(index)
+                drawLoadingBar(len(glob.glob(os.path.join(args.output_dir, "*.txt"))), len(lines))
+                l.release()
+
+        def init(l):
+            global lock
+            lock = l
+
+        l = Lock()
+        print("start pool")
+        with Pool(processes=args.num_agents, initializer=init, initargs=(l,)) as pool:
+            _ = pool.map(_process_line, enumerate(lines), 1)
 
 if __name__ == "__main__":
     main(parse_args(sys.argv[1:]))
diff --git a/tests/test_scrape.py b/tests/test_scrape.py
@@ -83,12 +83,12 @@ def test_save_document_normal():
         with tempfile.TemporaryDirectory() as tmpdirname:
             csv_filename = os.path.join(tmpdirname, "test.csv")
             with open(csv_filename, 'w') as f:
-                writer = csv.DictWriter(f, fieldnames=["document_id", "author_id", "L1", "english_proficiency"])
+                # writer = csv.DictWriter(f, fieldnames=["document_id", "author_id", "L1", "english_proficiency"])
                 test_document, test_author = make_mocks("1234", "1234", "hindi")
                 m.get("https://www.italki.com/api/notebook/1234", text=json.dumps(test_document))
                 m.get("https://www.italki.com/api/user/1234", text=json.dumps(test_author))
-                save_document("1234", tmpdirname, writer)
-            assert open(csv_filename).read() == "1234,1234,hindi,0\n"
+                doc = save_document("1234", tmpdirname)
+            assert doc == {"document_id": test_document["data"]["id"], "author_id": test_author["data"]["id"], "L1": "hindi", "english_proficiency": 0}
             assert open(os.path.join(tmpdirname, "1234.txt")).read() == test_document["data"]["content"]
 
 
@@ -100,8 +100,8 @@ def test_save_document_404():
             with open(csv_filename, 'w') as f:
                 writer = csv.DictWriter(f, fieldnames=["document_id", "author_id", "L1", "english_proficiency"])
                 m.get("https://www.italki.com/api/notebook/1234", status_code=404)
-                save_document("1234", tmpdirname, writer)
-            assert open(csv_filename).read() == ""
+                doc = save_document("1234", tmpdirname)
+            assert doc is None
             assert not os.path.isfile(os.path.join(tmpdirname, "1234.txt"))
 
 
@@ -135,15 +135,16 @@ def test_recreate():
             m.get("https://www.italki.com/api/notebook/4", text=json.dumps(test_document4))
             main(SimpleNamespace(
                 command="recreate",
-                agents=1,
+                num_agents=1,
                 output_dir=os.path.join(tmpdirname, "output"),
                 id_file=open(os.path.join(tmpdirname, "test_ids.txt"))
             ))
             assert open(os.path.join(tmpdirname, "output", "1.txt")).read() == test_document1["data"]["content"]
             assert open(os.path.join(tmpdirname, "output", "2.txt")).read() == test_document2["data"]["content"]
             assert open(os.path.join(tmpdirname, "output", "3.txt")).read() == test_document3["data"]["content"]
             assert open(os.path.join(tmpdirname, "output", "4.txt")).read() == test_document4["data"]["content"]
-            assert open(os.path.join(tmpdirname, "output", "labels.train.csv")).read() == "document_id,author_id,L1,english_proficiency\n1,1234,hindi,0\n4,12344,french,5\n"
+            print(open(os.path.join(tmpdirname, "output", "labels.train.csv")).readlines())
+            assert set(open(os.path.join(tmpdirname, "output", "labels.train.csv")).readlines()) == set(["document_id,author_id,L1,english_proficiency\n", "1,1234,hindi,0\n", "4,12344,french,5\n"])
             assert open(os.path.join(tmpdirname, "output", "labels.test.csv")).read() == "document_id,author_id,L1,english_proficiency\n2,1234,hindi,0\n"
             assert open(os.path.join(tmpdirname, "output", "labels.dev.csv")).read() == "document_id,author_id,L1,english_proficiency\n3,1234,hindi,0\n"