-
Notifications
You must be signed in to change notification settings - Fork 11
/
db_patent_stats.py
executable file
·80 lines (68 loc) · 3.34 KB
/
db_patent_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""
This is a simple helper script for extracting some statistics
on the patents contained in the DB and plotting them
(e.g. average section length, etc.)
"""
import sqlalchemy
import random
import numpy as np
import collections
import matplotlib.pyplot as plt
from database.make_patent_db import load_session, Patent, Citation
from sqlalchemy.sql.expression import func
from nltk.tokenize import word_tokenize
# Database session shared by the whole script.
session = load_session()

# Query for at most 10000 patents, ordered by the database's random()
# function, i.e. a random sample of the Patent table.
patent_sample = (
    session.query(Patent)
    .order_by(func.random())
    .limit(10000)
)
def calc_section_length():
    """Compute per-section token counts for the sampled patents and save them.

    Every patent yielded by the module-level ``patent_sample`` has its
    abstract, claims and description tokenized with NLTK's ``word_tokenize``;
    the number of tokens of each section is recorded.

    Two files are written with ``np.save``:

    * ``db_statistics/section_length.npy`` -- mapping from section name to
      the list of token counts (one entry per patent).
    * ``db_statistics/section_length_avg.npy`` -- mapping from section name
      to the mean token count of that section.

    Returns:
        None. The results are only written to disk.
    """
    sections = ('abstract', 'claims', 'description')

    section_length = collections.defaultdict(list)
    for pat in patent_sample:
        for name in sections:
            section_length[name].append(len(word_tokenize(getattr(pat, name))))

    # One mean per recorded section. The previous loop iterated over the keys
    # but recomputed all three hard-coded means on every pass.
    section_length_avg = {
        name: np.mean(np.array(counts))
        for name, counts in section_length.items()
    }

    # np.save stores these mappings as pickled 0-d object arrays; read them
    # back with np.load(..., allow_pickle=True).item().
    np.save('db_statistics/section_length_avg.npy', section_length_avg)
    np.save('db_statistics/section_length.npy', section_length)
def plot_section_length():
    """Plot the distribution of token counts per patent section.

    Reads ``db_statistics/section_length.npy`` (the mapping from section name
    to token counts written by ``calc_section_length``) and writes two
    figures:

    * ``db_statistics/section_length.pdf`` -- step histograms of the token
      counts of claims, descriptions and abstracts on a logarithmic x axis,
      with a dashed vertical line at each median.
    * ``db_statistics/section_length_est.pdf`` -- Gaussian kernel density
      estimates of the natural logarithm of the token counts.

    Returns:
        None. The figures are only written to disk.
    """
    # The file holds a pickled mapping inside a 0-d object array. NumPy
    # >= 1.16.3 refuses to unpickle unless allow_pickle=True is passed.
    # NOTE: unpickling executes arbitrary code -- only load files that this
    # script produced itself.
    section_length = np.load('db_statistics/section_length.npy',
                             allow_pickle=True).item()
    counts = {
        name: np.array(section_length[name])
        for name in ('description', 'claims', 'abstract')
    }

    # --- Figure 1: histograms of the raw token counts -------------------
    for name, color, label in (('claims', 'r', 'claims'),
                               ('description', 'b', 'descriptions'),
                               ('abstract', 'g', 'abstracts')):
        plt.hist(counts[name], bins=50, color=color, histtype='step',
                 label=label)
    for name, color in (('claims', 'r'), ('abstract', 'g'),
                        ('description', 'b')):
        plt.axvline(np.median(counts[name]), color=color, linestyle='dashed',
                    linewidth=2)
    plt.gca().set_xscale("log")
    plt.legend(loc=1)
    plt.xlabel('Number of tokens')
    plt.savefig('db_statistics/section_length.pdf')
    plt.clf()

    # --- Figure 2: kernel density estimates in log space -----------------
    # np.log maps a token count of 0 to -inf; nan_to_num replaces that with
    # the most negative finite float so the estimator receives finite input.
    log_counts = {
        name: np.nan_to_num(np.log(values)) for name, values in counts.items()
    }

    # Public import path; the private ``sklearn.neighbors.kde`` module is no
    # longer available in newer scikit-learn releases.
    from sklearn.neighbors import KernelDensity

    bw = 0.001
    minval = 0.
    maxval = np.max(log_counts['description'])
    X_plot = np.linspace(minval, maxval, 50)[:, np.newaxis]

    for name, color, label in (('claims', 'g', 'claims'),
                               ('abstract', 'y', 'abstracts'),
                               ('description', 'b', 'description')):
        kde = KernelDensity(kernel='gaussian', bandwidth=bw)
        kde.fit(log_counts[name][:, np.newaxis])
        # score_samples returns log-densities; exponentiate for plotting.
        plt.plot(X_plot[:, 0], np.exp(kde.score_samples(X_plot)),
                 color=color, label=label)
    plt.legend()
    # The x axis of this figure is the natural log of the token count.
    plt.xlabel('log(Number of tokens)')
    plt.savefig('db_statistics/section_length_est.pdf')
    plt.clf()
def main():
    """Recompute the section-length statistics, then plot them.

    The calculation runs first because the plotting step reads the
    ``section_length.npy`` file that the calculation step writes.
    """
    calc_section_length()
    plot_section_length()


if __name__ == "__main__":
    main()