Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional data to OpenSearch index. #766

Merged
merged 36 commits into from May 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
4d08a19
Add data_abstract to opensearch index
ItIsJordan Jan 8, 2024
5c67dc8
Fix test_submissions_csv in dashboard_test.py
ItIsJordan Jan 8, 2024
37d1490
Add data abstract search test
ItIsJordan Jan 9, 2024
b2c1a2a
Revert "Fix test_submissions_csv in dashboard_test.py"
ItIsJordan Jan 10, 2024
4cf908a
Add table data description to opensearch index
ItIsJordan Jan 12, 2024
0f60eb1
Update search testing to include table description searching
ItIsJordan Jan 12, 2024
f6437b2
Revert "Add table data description to opensearch index"
ItIsJordan Jan 26, 2024
ea6e78f
Add additional resource descriptions to index
ItIsJordan Jan 26, 2024
040a4dd
Merge branch 'main' into add-additional-desc-to-osindex
ItIsJordan Feb 5, 2024
c6e4920
Update test for resource searching in OpenSearch API
ItIsJordan Feb 6, 2024
5cd2256
Update hard-coded/assumed testing variables
ItIsJordan Feb 7, 2024
7557c76
Update broken test in test_dashboard.py
ItIsJordan Feb 9, 2024
12214c7
Fix unrendered LaTeX in table description
ItIsJordan Feb 9, 2024
882d6a9
Clear table quantity text on records page
ItIsJordan Feb 9, 2024
6445bd9
Refactor data table routes
ItIsJordan Feb 14, 2024
4dc662a
Update table renderer js
ItIsJordan Feb 14, 2024
a9bd63b
Update records_test.py
ItIsJordan Feb 14, 2024
4afcb2f
Fix button query bug
ItIsJordan Feb 23, 2024
e11033b
Revert large file bugfixes to move to new branch (last 6 commits)
ItIsJordan Feb 27, 2024
a07723f
Merge branch 'main' into add-additional-desc-to-osindex
ItIsJordan Feb 27, 2024
40927b3
Fix get json table data url
ItIsJordan Mar 27, 2024
0483fc1
Add resource url generation function
ItIsJordan Apr 2, 2024
a60fad9
Update record_mapping to include resources/data_abstract
ItIsJordan Apr 2, 2024
f982f14
Add function to generate DataResource dictionary
ItIsJordan Apr 2, 2024
d897f87
Add full resource data to OpenSearch index
ItIsJordan Apr 2, 2024
b6596cc
Add full resource data to table output
ItIsJordan Apr 2, 2024
41b9b59
Update search_test.py
ItIsJordan Apr 2, 2024
bde6455
Update advanced search help
ItIsJordan Apr 2, 2024
efad6bc
Merge branch 'main' into add-additional-desc-to-osindex
ItIsJordan Apr 2, 2024
f2d7ef1
Change use of SITE_URL
ItIsJordan Apr 16, 2024
944638e
Add shorthand for resource description searching
ItIsJordan Apr 16, 2024
d26ca1e
Update search help text
ItIsJordan Apr 16, 2024
6824749
Fix broken test
ItIsJordan Apr 16, 2024
9b1a5cc
Add "resources" data to data table json output
ItIsJordan Apr 17, 2024
e24c6e6
Update site_url use
ItIsJordan Apr 17, 2024
2846603
Update search_help.html
ItIsJordan Apr 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
16 changes: 16 additions & 0 deletions hepdata/ext/opensearch/config/record_mapping.py
Expand Up @@ -198,6 +198,22 @@
}
}
},
"resources": {
"properties": {
"description": {
"type": "text"
},
"type": {
"type": "text"
},
"url": {
"type": "text"
}
}
},
"data_abstract": {
"type": "text"
},
"parent_child_join": {
"type": "join",
"relations": {
Expand Down
47 changes: 45 additions & 2 deletions hepdata/ext/opensearch/document_enhancers.py
Expand Up @@ -34,6 +34,8 @@
from hepdata.ext.opensearch.config.record_mapping import mapping as os_mapping
from hepdata.modules.permissions.models import SubmissionParticipant
from hepdata.modules.submission.api import get_latest_hepsubmission
from hepdata.modules.submission.models import DataSubmission
from hepdata.utils.miscellaneous import get_resource_data

FORMATS = ['json', 'root', 'yaml', 'csv', 'yoda']

Expand Down Expand Up @@ -105,8 +107,8 @@ def add_analyses(doc):
if reference.file_type in current_app.config['ANALYSES_ENDPOINTS']:
doc["analyses"].append({'type': reference.file_type, 'analysis': reference.file_location})
elif reference.file_type == HISTFACTORY_FILE_TYPE:
SITE_URL = current_app.config.get('SITE_URL', 'https://www.hepdata.net')
landing_page_url = f"{SITE_URL}/record/resource/{reference.id}?landing_page=true"
site_url = current_app.config.get('SITE_URL', 'https://www.hepdata.net')
landing_page_url = f"{site_url}/record/resource/{reference.id}?landing_page=true"
doc["analyses"].append({'type': reference.file_type, 'analysis': landing_page_url,
'filename': os.path.basename(reference.file_location)})

Expand All @@ -127,6 +129,44 @@ def add_data_keywords(doc):
doc['data_keywords'] = dict(agg_keywords)


def add_data_abstract(doc):
    """
    Attach the data abstract of the associated HEPSubmission to the
    search document.

    :param doc: the document dict being prepared for indexing; must
        contain 'recid'
    :return: None (mutates ``doc`` in place)
    """
    hep_submission = get_latest_hepsubmission(
        publication_recid=doc['recid'], overall_status='finished'
    )
    doc['data_abstract'] = hep_submission.data_abstract


def add_data_resources(doc):
    """
    Attach resource metadata for a data table to the search document.

    Looks up the DataSubmission by the document's DOI, then delegates to
    ``get_resource_data`` for the resource data retrieval.

    :param doc: the document dict being prepared for indexing; must
        contain 'doi'
    :return: None (mutates ``doc`` in place)
    """
    data_submission = DataSubmission.query.filter_by(doi=doc["doi"]).one()
    doc['resources'] = get_resource_data(data_submission)


def add_submission_resources(doc):
    """
    Attach resource metadata for a publication to the search document.

    Fetches the latest finished HEPSubmission for the record, then
    delegates to ``get_resource_data`` for the resource data retrieval.

    :param doc: the document dict being prepared for indexing; must
        contain 'recid'
    :return: None (mutates ``doc`` in place)
    """
    hep_submission = get_latest_hepsubmission(
        publication_recid=doc['recid'], overall_status='finished'
    )
    doc['resources'] = get_resource_data(hep_submission)


def process_cmenergies(keywords):
cmenergies = []
if keywords['cmenergies']:
Expand Down Expand Up @@ -182,13 +222,16 @@ def enhance_data_document(doc):
add_data_table_urls(doc)
add_parent_publication(doc)
add_data_keywords(doc)
add_data_resources(doc)


def enhance_publication_document(doc):
add_id(doc)
add_doc_type(doc, CFG_PUB_TYPE)
add_data_submission_urls(doc)
add_data_abstract(doc)
add_shortened_authors(doc)
process_last_updates(doc)
add_analyses(doc)
add_parent_child_info(doc)
add_submission_resources(doc)
3 changes: 2 additions & 1 deletion hepdata/ext/opensearch/query_builder.py
Expand Up @@ -51,7 +51,8 @@ def parse_query(query_string):
"cmenergies": "data_keywords.cmenergies",
"phrases": "data_keywords.phrases",
"reactions": "data_keywords.reactions",
"analysis": "analyses.type"
"analysis": "analyses.type",
"resources": "resources.description" # Add shorthand for resource description
}
}

Expand Down
4 changes: 3 additions & 1 deletion hepdata/modules/records/api.py
Expand Up @@ -65,7 +65,7 @@
RelatedTable
)
from hepdata.utils.file_extractor import extract
from hepdata.utils.miscellaneous import sanitize_html
from hepdata.utils.miscellaneous import sanitize_html, get_resource_data
from hepdata.utils.users import get_user_from_id
from bs4 import BeautifulSoup
from hepdata_converter_ws_client import Error
Expand Down Expand Up @@ -993,6 +993,8 @@ def process_data_tables(ctx, data_record_query, first_data_id,
"id": submission_record.id, "processed_name": processed_name,
"name": submission_record.name,
"location": submission_record.location_in_publication,
# Generate resource metadata
"resources": get_resource_data(submission_record),
"doi": submission_record.doi,
"description": sanitize_html(
truncate_string(submission_record.description, 20),
Expand Down
1 change: 1 addition & 0 deletions hepdata/modules/records/utils/data_processing_utils.py
Expand Up @@ -261,6 +261,7 @@ def generate_table_headers(table_contents):
"table_license": table_contents["table_license"],
"related_tables" : table_contents["related_tables"],
"related_to_this" : table_contents["related_to_this"],
"resources" : table_contents["resources"],
"review": table_contents["review"],
"associated_files": table_contents["associated_files"],
"keywords": {},
Expand Down
6 changes: 4 additions & 2 deletions hepdata/modules/records/views.py
Expand Up @@ -65,6 +65,7 @@
update_action_for_submission_participant
from hepdata.modules.stats.views import increment
from hepdata.modules.permissions.models import SubmissionParticipant
from hepdata.utils.miscellaneous import get_resource_data

logging.basicConfig()
log = logging.getLogger(__name__)
Expand Down Expand Up @@ -303,9 +304,9 @@ def get_table_data(data_recid, version):
return jsonify(generate_table_data(table_contents))


@blueprint.route('/data/<int:recid>/<int:data_recid>/<int:version>/', defaults={'load_all': 1})
@blueprint.route('/data/<int:recid>/<int:data_recid>/<int:version>/')
@blueprint.route('/data/<int:recid>/<int:data_recid>/<int:version>/<int:load_all>')
def get_table_details(recid, data_recid, version, load_all):
def get_table_details(recid, data_recid, version, load_all=1):
"""
Get the table details of a given datasubmission.

Expand Down Expand Up @@ -336,6 +337,7 @@ def get_table_details(recid, data_recid, version, load_all):
table_contents["table_license"] = generate_license_data_by_id(data_record.file_license)
table_contents["related_tables"] = get_table_data_list(datasub_record, "related")
table_contents["related_to_this"] = get_table_data_list(datasub_record, "related_to_this")
table_contents["resources"] = get_resource_data(datasub_record)
table_contents["doi"] = datasub_record.doi
table_contents["location"] = datasub_record.location_in_publication
table_contents["size"] = size_check["size"]
Expand Down
Expand Up @@ -21,7 +21,7 @@ <h4 class="modal-title">Advanced Search Tips</h4>
target="_new">Elasticsearch documentation</a>.</p>

<div class="well well-small">
<h4>Search on title or abstract</h4>
<h4>Search on title, abstract, or record abstract</h4>
<ul>
<li>Find all data with <em>collisions</em> in the <strong>title</strong>
<br/>
Expand All @@ -41,6 +41,16 @@ <h4>Search on title or abstract</h4>
</li>
</ul>
</li>
<br>

<li>Find all data with <em>"CERN-LHC"</em> in the <strong>data abstract</strong>
<li><i>"Comment" in the submission.yaml file</i></li>
<ul>
<li>
<a href='/search?q=data_abstract:CERN-LHC' target="_new">data_abstract:CERN-LHC</a>
</li>
</ul>
</li>

</ul>
</div>
Expand Down Expand Up @@ -104,6 +114,43 @@ <h4>Search by keywords</h4>

<div class="clearfix"></div>

<div class="well well-small">
<h4>Searching resources by field</h4>
<ul>
<li>Text-based description searching:
<ul>
<li>
<a href='/search?q=resources.description:"Created with hepdata_lib"' target="_new">resources:"Created with hepdata_lib"</a>
</li>
</ul>
</li>
<br>

<li>Resource-type searching:
<ul>
<li>
<a href='/search?q=resources.type:png' target="_new">resources.type:png</a>
</li>
<li>Examples: png, html, github, zenodo etc.</li>
</ul>
</li>
<br>

<li>Searching for specific URLs:
<ul>
<li>
<a href='/search?q=resources.url:atlas.web.cern.ch' target="_new">resources.url:atlas.web.cern.ch</a>
</li>
</ul>
</li>
<li>
<span class="text-muted">Quotes force a full match.</span>
</li>
</ul>
</div>

<div class="clearfix"></div>

<div class="well well-small">
<h4>Other useful searches</h4>
<ul>
Expand Down
45 changes: 45 additions & 0 deletions hepdata/utils/miscellaneous.py
Expand Up @@ -23,6 +23,7 @@
import re

import bleach
from flask import current_app


def splitter(data, predicate):
Expand Down Expand Up @@ -70,3 +71,47 @@ def sanitize_html(value, tags=None, attributes=None, strip=False):
)

return cleaned

def generate_resource_url(resource):
    """
    Generate a URL for a resource from its file_location/ID.

    If the resource's file_location is already an external URL (it starts
    with "http://" or "https://"), it is returned unchanged. Otherwise a
    HEPData landing-page URL is built from SITE_URL and the resource ID.

    :param resource: DataResource object for generation
    :return: The generated URL string
    """
    location = resource.file_location
    # Strict scheme check: the looser startswith("http") test would also
    # treat local paths such as "http_files/..." as external URLs.
    if location.startswith(("http://", "https://")):
        return location

    # Not an external URL: build a hepdata.net URL using the resource ID.
    site_url = current_app.config.get('SITE_URL', 'https://www.hepdata.net')
    return f"{site_url}/record/resource/{resource.id}?landing_page=true"


def get_resource_data(submission):
    """
    Build a list of description/type/url dictionaries for a submission's
    resource objects, for addition to the OpenSearch index.

    Accepts either a DataSubmission or a HEPSubmission, both of which
    carry a ``resources`` collection.

    :param submission: HEPSubmission/DataSubmission object
    :return: list of dicts, one per resource
    """
    return [
        {
            "description": resource.file_description,
            "type": resource.file_type,
            "url": generate_resource_url(resource),
        }
        for resource in submission.resources
    ]
7 changes: 6 additions & 1 deletion tests/conftest.py
Expand Up @@ -156,7 +156,12 @@ def get_identifiers():
{"hepdata_id": "ins1245023", "inspire_id": '1245023',
"title": "High-statistics study of $K^0_S$ pair production in two-photon collisions",
"data_tables": 40,
"arxiv": "arXiv:1307.7457"}
"arxiv": "arXiv:1307.7457"},
{"hepdata_id": "ins2751932", "inspire_id": '2751932',
"title": "Search for pair production of higgsinos in events with two Higgs bosons and missing "
"transverse momentum in $\\sqrt{s}=13$ TeV $pp$ collisions at the ATLAS experiment",
"data_tables": 66,
"arxiv": "arXiv:2401.14922"}
]

@pytest.fixture()
Expand Down
5 changes: 3 additions & 2 deletions tests/dashboard_test.py
Expand Up @@ -476,11 +476,12 @@ def test_submissions_csv(app, admin_idx, load_default_data, identifiers):
site_url = app.config.get('SITE_URL', 'https://www.hepdata.net')
csv_data = get_submissions_csv(user, include_imported=True)
csv_lines = csv_data.splitlines()
assert len(csv_lines) == 3
assert len(csv_lines) == 4
assert csv_lines[0] == 'hepdata_id,version,url,inspire_id,arxiv_id,title,collaboration,creation_date,last_updated,status,uploaders,reviewers'
today = datetime.datetime.utcnow().date().isoformat()
assert csv_lines[1] == f'16,1,{site_url}/record/16,1245023,arXiv:1307.7457,High-statistics study of $K^0_S$ pair production in two-photon collisions,Belle,{today},2013-12-17,finished,,'
assert csv_lines[2] == f'1,1,{site_url}/record/1,1283842,arXiv:1403.1294,Measurement of the forward-backward asymmetry in the distribution of leptons in $t\\bar{{t}}$ events in the lepton+jets channel,D0,{today},2014-08-11,finished,,'
assert csv_lines[3] == f'57,1,{site_url}/record/57,2751932,arXiv:2401.14922,Search for pair production of higgsinos in events with two Higgs bosons and missing transverse momentum in $\sqrt{{s}}=13$ TeV $pp$ collisions at the ATLAS experiment,ATLAS,{today},{today},finished,,'

# Get data without imported records - should be empty (headers only)
csv_data = get_submissions_csv(user, include_imported=False)
Expand Down Expand Up @@ -524,5 +525,5 @@ def test_submissions_csv(app, admin_idx, load_default_data, identifiers):
# Get CSV again - should be uploader and reviewers in line 2 now
csv_data = get_submissions_csv(user, include_imported=True)
csv_lines = csv_data.splitlines()
assert len(csv_lines) == 3
assert len(csv_lines) == 4
assert csv_lines[2] == f'1,1,{site_url}/record/1,1283842,arXiv:1403.1294,Measurement of the forward-backward asymmetry in the distribution of leptons in $t\\bar{{t}}$ events in the lepton+jets channel,D0,{today},2014-08-11,finished,test@test.com (Una Uploader),test2@test.com (Rowan Reviewer) | test@hepdata.net'
8 changes: 4 additions & 4 deletions tests/doi_minter_test.py
Expand Up @@ -307,7 +307,7 @@ def test_generate_dois_for_submission(mock_data_cite_provider, identifiers):
mock_data_cite_provider.reset_mock()
record_information = create_record({})
recid = record_information['recid']
assert recid == 106
assert recid == 173
hep_submission = get_or_create_hepsubmission(recid)
generate_dois_for_submission(recid, recid)
mock_data_cite_provider.assert_not_called()
Expand All @@ -334,9 +334,9 @@ def test_generate_dois_for_submission(mock_data_cite_provider, identifiers):
# Generate DOIs again - should work and call `create` for record, v1, table
generate_dois_for_submission(recid, recid)
mock_data_cite_provider.create.assert_has_calls([
call('10.17182/hepdata.106'),
call('10.17182/hepdata.106.v1'),
call('10.17182/hepdata.106.v1/t1')
call('10.17182/hepdata.173'),
call('10.17182/hepdata.173.v1'),
call('10.17182/hepdata.173.v1/t1')
])
# Should have twice as many get calls as register calls (because get is called by create)
assert mock_data_cite_provider.get.call_count == 6
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/test_dashboard.py
Expand Up @@ -206,7 +206,7 @@ def test_dashboard(live_server, logged_in_browser):
)
assert(response.status_code == 200)
decoded_lines = response.content.decode('utf-8').splitlines()
assert len(decoded_lines) == 4
assert len(decoded_lines) == 5
csv_reader = csv.reader(decoded_lines)
for row in csv_reader:
assert len(row) == 12
Expand Down