/
vars.json
141 lines (141 loc) · 3.07 KB
/
vars.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
{
"ASSET_TAGS" : {
"D_S" : "doc_split",
"AS_PDF" : "as_pdf",
"TOKENS_NLP" : "stanford_core_nlp_tokenizer",
"DOC_CLOUD_ENTITIES" : "document_cloud_entities",
"DOC_CLOUD_MANIFEST" : "document_cloud_manifest",
"DOC_CLOUD_DOC" : "document_cloud_document",
"ADDRESSES_NLP" : "addresses_everyblock_nlp",
"PAGE_MAP" : "uv_page_map",
"STANFORD_NER_ENTITIES" : "stanford-ner_entities",
"CP_ENTITIES" : "compass_entity_document",
"CP_TOPICS" : "compass_topic_model_document",
"GM_D" : "gensim_dict",
"GM_MM" : "gensim_corpus_mm",
"GM_LSI" : "gensim_lsi_model",
"GM_TOPICS" : "gensim_lsi_topics"
},
"ELASTICSEARCH_MAPPING_STUBS" : {
"cp_page_text" : {
"properties" : {
"searchable_text" : {
"type" : "string"
},
"media_id" : {
"type" : "string",
"index" : "not_analyzed"
}
},
"_parent" : {
"type" : "uv_document"
}
}
},
"QUERY_DEFAULTS" : {
"CP_PAGE_TEXT" : {
"bool" : {
"must_not" : [
{
"constant_score" : {
"filter" : {
"missing" : {
"field" : "cp_page_text.media_id"
}
}
}
}
],
"must" : []
}
}
},
"QUERY_KEYS" : {
"filter_terms" : ["index_in_parent"]
},
"MIME_TYPES" : {},
"MIME_TYPE_MAP" : {},
"MIME_TYPE_TASKS" : {
"application/pdf" : [
"PDF.split_pdf_pages.splitPDFPages",
"PDF.extract_pdf_text.extractPDFText",
"Text.preprocess_nlp.preprocessNLP",
"NLP.page_map.generatePageMap",
"NLP.topic_modeler.createGensimObjects",
"NLP.ner_entity_extractor.extractNEREntities",
"PDF.process_metadata.processPDFMetadata"
],
"text/plain" : [
"Text.evaluate_text.evaluateText",
"Text.preprocess_nlp.preprocessNLP",
"NLP.page_map.generatePageMap",
"NLP.topic_modeler.createGensimObjects",
"NLP.ner_entity_extractor.extractNEREntities",
"PDF.process_metadata.processPDFMetadata"
]
},
"METADATA_ASPECTS" : {
"PDF" : [
{
"tag_position" : "^File:\\s*(.*)",
"label" : "Filename",
"type" : "str",
"ideal" : null
},
{
"tag_position" : "^SHA1:\\s*(\\w{40})",
"label" : "SHA1",
"type" : "str",
"ideal" : null
},
{
"tag_position" : "^Size:\\s*(\\d+)\\sbytes",
"label" : "File Size",
"type" : "int",
"ideal" : null
},
{
"tag_position" : "^Version:\\s(.*)",
"label" : "Version",
"type" : "str",
"ideal" : null
},
{
"tag_position" : "^Info Object in version (\\d):",
"label" : "Version Available",
"type" : "int",
"ideal" : null
},
{
"tag_position" : "/Title (.*)",
"label" : "XMP Title",
"type" : "str",
"ideal" : null
},
{
"tag_position" : "/Creator (.*)",
"label" : "XMP Creator",
"type" : "str",
"ideal" : null
},
{
"tag_position" : "/ModDate D:(.*)'00'",
"label" : "XMP ModDate",
"type" : "str",
"ideal" : null
},
{
"tag_position" : "/CreationDate(.*)",
"label" : "XMP CreationDate",
"type" : "str",
"ideal" : null
},
{
"tag_position" : "/Author (.*)",
"label" : "XMP Author",
"type" : "str",
"ideal" : null
}
]
}
}