/
dhlab_v2.py
358 lines (155 loc) · 7.19 KB
/
dhlab_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import requests
import pandas as pd
BASE_URL = "https://api.nb.no/ngram/db2"
BASE_URL1 = "https://api.nb.no/ngram/db1"
pd.options.display.max_rows = 100
def get_reference(corpus = 'digavis', from_year = 1950, to_year = 1955, lang = 'nob', limit = 100000):
    """Fetch a reference corpus of word frequencies.

    Queries the /reference_corpus endpoint with the given corpus type,
    year span, language code and row limit.  Returns a dataframe of
    (word, freq) rows indexed by word; empty when the request fails.
    """
    query = locals()
    response = requests.get(BASE_URL + "/reference_corpus", params = query)
    rows = response.json() if response.status_code == 200 else []
    return pd.DataFrame(rows, columns = ['word', 'freq']).set_index('word')
def find_urns(docids = None, mode = 'json'):
    """ Return a list of URNs from a list of docids as a dictionary {docid: URN} or as a pandas dataframe"""
    payload = locals()
    response = requests.post(BASE_URL1 + "/find_urn", json = payload)
    # guard clause: empty frame when the service does not answer cleanly
    if response.status_code != 200:
        return pd.DataFrame()
    return pd.DataFrame.from_dict(response.json(), orient = 'index', columns = ['urn'])
def ngram_book(word = ['.'], title = None, period = None, publisher = None, lang=None, city = None, ddk = None, topic = None):
    """Get a time series for a word as string, title is name of book period is (year, year), lang is three letter iso code.
    Use % as wildcard where appropriate - no wildcards in word and lang.
    Returns a dataframe with one column per word, indexed by year;
    an empty dataframe if the request fails or matches nothing."""
    params = locals()
    if isinstance(word, str):
        # assume a comma separated string
        word = [w.strip() for w in word.split(',')]
    params['word'] = tuple(word)
    # drop unset filters so the API only sees explicit constraints
    params = {x:params[x] for x in params if not params[x] is None}
    r = requests.post(BASE_URL1 + "/ngram_book", json = params)
    # consistent with get_reference/find_urns: empty result on failure
    # instead of a confusing JSONDecodeError from r.json()
    if r.status_code != 200:
        return pd.DataFrame()
    df = pd.DataFrame.from_dict(r.json(), orient = 'index')
    if df.empty:
        # nothing matched; .levels[0] below would fail on a flat index
        return df
    # index entries look like "<word> <year>"; split into a MultiIndex
    df.index = df.index.map(lambda x: tuple(x.split()))
    columns = df.index.levels[0]
    # one column per word, aligned on year
    df = pd.concat([df.loc[x] for x in columns], axis = 1)
    df.columns = columns
    return df
def ngram_periodicals(word = ['.'], title = None, period = None, publisher = None, lang=None, city = None, ddk = None, topic = None):
    """Get a time series for a word as string, title is name of periodical period is (year, year), lang is three letter iso code.
    Use % as wildcard where appropriate - no wildcards in word and lang.
    Returns a dataframe with one column per word, indexed by year;
    an empty dataframe if the request fails or matches nothing."""
    params = locals()
    if isinstance(word, str):
        # assume a comma separated string
        word = [w.strip() for w in word.split(',')]
    params['word'] = tuple(word)
    # drop unset filters so the API only sees explicit constraints
    params = {x:params[x] for x in params if not params[x] is None}
    r = requests.post(BASE_URL1 + "/ngram_periodicals", json = params)
    # consistent with get_reference/find_urns: empty result on failure
    # instead of a confusing JSONDecodeError from r.json()
    if r.status_code != 200:
        return pd.DataFrame()
    df = pd.DataFrame.from_dict(r.json(), orient = 'index')
    if df.empty:
        # nothing matched; .levels[0] below would fail on a flat index
        return df
    # index entries look like "<word> <year>"; split into a MultiIndex
    df.index = df.index.map(lambda x: tuple(x.split()))
    columns = df.index.levels[0]
    # one column per word, aligned on year
    df = pd.concat([df.loc[x] for x in columns], axis = 1)
    df.columns = columns
    return df
def ngram_news(word = ['.'], title = None, period = None):
    """ get a time series period is a tuple of (year, year), (yearmonthday, yearmonthday)
    word is string and title is the title of newspaper, use % as wildcard.
    Returns a dataframe with one column per word, indexed by date;
    an empty dataframe if the request fails or matches nothing."""
    params = locals()
    if isinstance(word, str):
        # assume a comma separated string
        word = [w.strip() for w in word.split(',')]
    params['word'] = tuple(word)
    # drop unset filters so the API only sees explicit constraints
    params = {x:params[x] for x in params if not params[x] is None}
    r = requests.post(BASE_URL1 + "/ngram_newspapers", json = params)
    # consistent with get_reference/find_urns: empty result on failure
    # instead of a confusing JSONDecodeError from r.json()
    if r.status_code != 200:
        return pd.DataFrame()
    df = pd.DataFrame.from_dict(r.json(), orient = 'index')
    if df.empty:
        # nothing matched; .levels[0] below would fail on a flat index
        return df
    # index entries look like "<word> <date>"; split into a MultiIndex
    df.index = df.index.map(lambda x: tuple(x.split()))
    columns = df.index.levels[0]
    # one column per word, aligned on date
    df = pd.concat([df.loc[x] for x in columns], axis = 1)
    df.columns = columns
    return df
def get_document_frequencies(urns = None, cutoff = 0):
    """Fetch word frequencies for a list of URNs.

    Returns a dataframe with one column per URN and words as the index,
    sorted descending by the first column.  Empty dataframe if the
    request fails or no frequencies are returned."""
    params = locals()
    r = requests.post(BASE_URL1 + "/frequencies", json = params)
    if r.status_code != 200:
        return pd.DataFrame()
    result = r.json()
    # each u is a list of (urn, word, freq) rows for one document;
    # key on the urn (u[0][0]) and map word -> freq for its rows
    structure = {u[0][0] : dict([tuple(x[1:]) for x in u]) for u in result if u != []}
    df = pd.DataFrame(structure)
    if df.empty:
        # avoid IndexError on df.columns[0] when nothing came back
        return df
    return df.sort_values(by = df.columns[0], ascending = False)
def get_document_corpus(**kwargs):
    """Alias for document_corpus - accepts the same keyword arguments."""
    return document_corpus(**kwargs)
def document_corpus(doctype = None, author = None, from_year = None, to_year = None, from_timestamp = None, to_timestamp = None, title = None, ddk = None, subject = None, lang = None, limit = None):
    """ Fetch a corpus based on metadata - doctypes are digibok, digavis, digitidsskrift.
    All filters are optional; only non-None values are sent to the API.
    Returns a dataframe of matching documents; empty on a failed request."""
    parms = locals()
    # send only the filters the caller actually set
    params = {x:parms[x] for x in parms if not parms[x] is None }
    if "ddk" in params:
        # quote the dots so the Dewey number survives fts tokenization
        params["ddk"] = params['ddk'].replace('.', '"."')
    r = requests.post(BASE_URL + "/build_corpus", json = params)
    # empty result on failure instead of a JSONDecodeError from r.json()
    if r.status_code != 200:
        return pd.DataFrame()
    return pd.DataFrame(r.json())
def urn_collocation(urns = None, word = 'arbeid', before = 5, after = 0, samplesize = 200000):
    """ Create a collocation from a list of URNs - returns distance (sum of distances and bayesian distance) and frequency.
    Returns an empty dataframe on a failed request."""
    from io import StringIO  # local import: only needed in this function
    params = {
        'urn': urns,
        'word': word,
        'before': before,
        'after': after,
        'samplesize': samplesize
    }
    r = requests.post(BASE_URL1 + "/urncolldist_urn", json = params)
    if r.status_code != 200:
        return pd.DataFrame()
    # wrap in StringIO: passing a raw JSON string to read_json is deprecated
    return pd.read_json(StringIO(r.text))
def totals(n = 50000):
    """ Get total frequencies of words in database.

    n is the number of top words to fetch.  Returns a dataframe indexed
    by word with a 'freq' column; empty on a failed request."""
    r = requests.get(BASE_URL + "/totals/{n}".format(n = n))
    # empty result on failure instead of a JSONDecodeError from r.json()
    if r.status_code != 200:
        return pd.DataFrame()
    return pd.DataFrame.from_dict(dict(r.json()),orient = 'index', columns = ['freq'])
def concordance(urns = None, words = None, window = 25, limit = 100):
    """ Get a list of concordances from database, words is an fts5 string search expression.
    Returns an empty dict when words is None (historical contract),
    otherwise a dataframe of concordances; empty on a failed request."""
    if words is None:
        # keep the historical contract: nothing to search for
        return {}
    params = {
        'urns': urns,
        'query': words,
        'window': window,
        'limit': limit
    }
    r = requests.post(BASE_URL + "/conc", json = params)
    # empty result on failure instead of a JSONDecodeError from r.json()
    if r.status_code != 200:
        return pd.DataFrame()
    return pd.DataFrame(r.json())
def concordance_counts(urns = None, words = None, window = 25, limit = 100):
    """ Count concordances from database, words is an fts5 string search expression.
    Returns an empty dict when words is None (historical contract),
    otherwise a dataframe of counts; empty on a failed request."""
    if words is None:
        # keep the historical contract: nothing to search for
        return {}
    params = {
        'urns': urns,
        'query': words,
        'window': window,
        'limit': limit
    }
    r = requests.post(BASE_URL + "/conccount", json = params)
    # empty result on failure instead of a JSONDecodeError from r.json()
    if r.status_code != 200:
        return pd.DataFrame()
    return pd.DataFrame(r.json())
def konkordans(urns = None, query = None, window = 25, limit = 100):
    """Norwegian-named variant of concordance(); query is an fts5 search expression.
    Returns an empty dict when query is None (historical contract),
    otherwise a dataframe of concordances; empty on a failed request."""
    if query is None:
        # keep the historical contract: nothing to search for
        return {}
    params = {
        'urns': urns,
        'query': query,
        'window': window,
        'limit': limit
    }
    r = requests.post(BASE_URL + "/conc", json = params)
    # empty result on failure instead of a JSONDecodeError from r.json()
    if r.status_code != 200:
        return pd.DataFrame()
    return pd.DataFrame(r.json())
def collocation(corpusquery = 'norge', word = 'arbeid', before = 5, after = 0):
    """Create a collocation for word over a corpus defined by corpusquery.
    Returns a dataframe of collocation statistics; empty on a failed request."""
    from io import StringIO  # local import: only needed in this function
    params = {
        'metadata_query': corpusquery,
        'word': word,
        'before': before,
        'after': after
    }
    r = requests.post(BASE_URL1 + "/urncolldist", json = params)
    if r.status_code != 200:
        return pd.DataFrame()
    # wrap in StringIO: passing a raw JSON string to read_json is deprecated
    return pd.read_json(StringIO(r.text))