tasks.py
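"""Celery tasks: extract a PDF's metadata with linkrot and classify each of
its references (arXiv, DOI, plain URL, or direct PDF link) in parallel."""
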
from time import sleep
from urllib.parse import urlparse

import linkrot
from celery import group, shared_task
from linkrot.downloader import get_status_code


@shared_task(ignore_result=False)
def pdfdata_task(path):
    """Parse the PDF at `path`, normalise its date metadata, and classify
    every outgoing reference via a group of sort_ref subtasks."""
    pdf = linkrot.linkrot(path)
    metadata = pdf.get_metadata()
    # PDF date entries look like "D:YYYYMMDDHHmmSS+HH'mm'"; reformat them
    # as "YYYY-MM-DD HH:mm:SS UTC+HH:mm" for display.
    for key in metadata:
        if "Date" in key:
            value = metadata[key]
            metadata[key] = '{y}-{mo}-{d} {h}:{m}:{s} UTC{th}:{tm}'.format(
                y=value[2:6],
                mo=value[6:8],
                d=value[8:10],
                h=value[10:12],
                m=value[12:14],
                s=value[14:16],
                th=value[16:19],
                tm=value[20:22])
    refs = pdf.get_references()
    # Fan the references out to sort_ref as a parallel Celery group.
    g = group(sort_ref.s(dict(reftype=ref_row.reftype, ref=ref_row.ref))
              for ref_row in refs)
    res = g()
    # Poll until every child finishes. Blocking on subtasks inside a task
    # can exhaust the worker pool; a chord (see below) avoids the wait.
    while not res.ready():
        sleep(1)
    result_data = [child.result for child in res]
    return {'metadata': metadata, 'result_data': result_data}
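
# A non-blocking alternative (a sketch, not the module's current behaviour):
# Celery's chord primitive hands the group's results to a callback task
# instead of polling inside the worker, e.g.
#
#     from celery import chord
#     chord(sort_ref.s(r) for r in ref_dicts)(collect_results.s())
#
# where collect_results is a hypothetical callback task that would receive
# the list of sort_ref results.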

@shared_task(ignore_result=False)
def sort_ref(ref_dict):
    """Bucket one reference into pdfs/urls/arxiv/doi and record its HTTP status."""
    result = dict(pdfs=[],
                  urls=[],
                  arxiv=[],
                  doi=[],
                  check=[])
    if ref_dict['reftype'] == 'arxiv':
        url = "https://arxiv.org/abs/" + ref_dict['ref']
        result['arxiv'].append(url)
    elif ref_dict['reftype'] == 'doi':
        url = "https://doi.org/" + ref_dict['ref']
        result['doi'].append(url)
    else:
        url = ref_dict['ref']
        # Record the HTTP status of the raw link; "0" marks an unreachable URL.
        try:
            stat = str(get_status_code(url))
        except Exception:
            stat = "0"
        result["check"].append(stat)
        if ref_dict['reftype'] == 'url':
            # Re-bucket plain URLs that actually point at DOI or arXiv hosts.
            host = urlparse(url).hostname
            if host and host.endswith("doi.org"):
                result['doi'].append(url)
            elif host and host.endswith("arxiv.org"):
                result['arxiv'].append(url)
            else:
                if not urlparse(url).scheme:
                    url = 'https://' + url
                result['urls'].append(url)
        elif ref_dict['reftype'] == 'pdf':
            result['pdfs'].append(url)
    return result
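
# Minimal usage sketch (the file path and calling context are assumptions,
# not part of this module): enqueue the task from a shell or view, then poll
# for its result.
#
#     from tasks import pdfdata_task
#
#     async_result = pdfdata_task.delay("paper.pdf")  # hypothetical path
#     if async_result.ready():
#         data = async_result.get()
#         print(data['metadata'], data['result_data'])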