Merge pull request #83 from GLAM-Workbench/update
Update scripts. Bump version
wragge committed Aug 31, 2023
2 parents 59fbc51 + 9ed6b27 commit bc19495
Showing 9 changed files with 613 additions and 18 deletions.
.jupyter/jupyter_lab_config.py: 3 changes (2 additions, 1 deletion)
@@ -1129,4 +1129,5 @@
# Should be in the form of an HTTP origin: ws[s]://hostname[:port]
# Default: ''
# c.ServerApp.websocket_url = ''
- c.VoilaConfiguration.file_allowlist = '.*\.zip'
+ c.VoilaConfiguration.file_allowlist = '.*'
+ c.VoilaConfiguration.template = 'material'
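
Voila's file_allowlist is a regular expression (or list of them) checked against the paths of files the app will serve for download; loosening it from '.*\.zip' to '.*' means any harvested file (CSVs, text, images), not just zip archives, can be offered to users. A minimal sketch of the matching behaviour (an illustration of the regex change, not Voila's actual code):

```python
import re

old_allow = re.compile(r'.*\.zip')  # previous setting: only zip files
new_allow = re.compile(r'.*')       # new setting: any file

for path in ['harvest.zip', 'results.csv', 'image.jpg']:
    print(path, bool(old_allow.fullmatch(path)), bool(new_allow.fullmatch(path)))
# harvest.zip True True
# results.csv False True
# image.jpg False True
```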
.zenodo.json: 14 changes (4 additions, 10 deletions)
@@ -5,7 +5,7 @@
"related_identifiers": [
{
"scheme": "url",
"identifier": "https://github.com/GLAM-Workbench/trove-newspaper-harvester/tree/v1.3.1",
"identifier": "https://github.com/GLAM-Workbench/trove-newspaper-harvester/tree/v2.0.0",
"relation": "isDerivedFrom",
"resource_type": "software"
},
@@ -20,29 +20,23 @@
"identifier": "https://glam-workbench.github.io/",
"relation": "isPartOf",
"resource_type": "other"
- },
- {
- "scheme": "url",
- "identifier": "https://mybinder.org/v2/zenodo/10.5281/zenodo.3545044/",
- "relation": "isSourceOf",
- "resource_type": "other"
}
],
"version": "v1.3.1",
"version": "v2.0.0",
"upload_type": "software",
"keywords": [
"Trove",
"newspapers",
"Jupyter",
"GLAM Workbench"
],
"publication_date": "2023-04-06",
"publication_date": "2023-08-31",
"creators": [
{
"orcid": "0000-0001-7956-4498",
"name": "Sherratt, Tim"
}
],
"access_right": "open",
"description": "<p>Current version: <a href=\"https://github.com/GLAM-Workbench/trove-newspaper-harvester/releases/tag/v1.3.1\">v1.3.1</a></p> <p>The <a href=\"https://pypi.org/project/trove-newspaper-harvester/\">Trove Newspaper &amp; Gazette Harvester Harvester</a> makes it easy to download large quantities of digitised articles from Trove’s newspapers and gazettes. Just give it a search from the Trove web interface, and the harvester will save the metadata of all the articles in a CSV (spreadsheet) file for further analysis. You can also save the full text of every article, as well as copies of the articles as JPG images, and even PDFs. While the web interface will only show you the first 2,000 results matching your search, the Newspaper &amp; Gazette Harvester will get <strong>everything</strong>.</p> <p>The Jupyter notebooks in this repository use the Trove Newspaper and Gazette Harvester to download large quantities of digitised newspaper articles from Trove. There’s also a few examples of how you can analyse and explore the harvested data.</p> <p>The notebooks include:</p> <ul> <li><strong>Using TroveHarvester to get newspaper articles in bulk</strong> — an easy introduction to the TroveHarvester tool</li> <li><strong>Trove Harvester web app</strong> — a simple web interface to the TroveHarvester, the easiest way to harvest data from Trove (runs in Voila)</li> <li><strong>Harvesting articles that mention “Anzac Day” on Anzac Day</strong> – import the Harvester as a Python library to harvest a complex search</li> <li><strong>Display the results of a harvest as a searchable database using Datasette</strong> – load your harvested data into a SQLite database and explore it using Datasette</li> <li><strong>Exploring your TroveHarvester data</strong> — use Pandas to analyse your data and create some visualisations</li> <li><strong>Explore harvested text files</strong> (experimental) — analyse the full text content of harvested articles</li> </ul> <p>See the <a href=\"https://glam-workbench.github.io/trove-harvester/\">GLAM Workbench for more details</a>.</p> <h2 id=\"cite-as\">Cite as</h2> <p>See the GLAM Workbench or <a href=\"https://doi.org/10.5281/zenodo.3545044\">Zenodo</a> for up-to-date citation details.</p> <hr /> <p>This repository is part of the <a href=\"https://glam-workbench.github.io/\">GLAM Workbench</a>.<br /> If you think this project is worthwhile, you might like <a href=\"https://github.com/sponsors/wragge?o=esb\">to sponsor me on GitHub</a>.</p>"
"description": "<p>Current version: <a href=\"https://github.com/GLAM-Workbench/trove-newspaper-harvester/releases/tag/v2.0.0\">v2.0.0</a></p> <p>The <a href=\"https://pypi.org/project/trove-newspaper-harvester/\">Trove Newspaper &amp; Gazette Harvester Harvester</a> makes it easy to download large quantities of digitised articles from Trove’s newspapers and gazettes. Just give it a search from the Trove web interface, and the harvester will save the metadata of all the articles in a CSV (spreadsheet) file for further analysis. You can also save the full text of every article, as well as copies of the articles as JPG images, and even PDFs. While the web interface will only show you the first 2,000 results matching your search, the Newspaper &amp; Gazette Harvester will get <strong>everything</strong>.</p> <p>The Jupyter notebooks in this repository use the Trove Newspaper and Gazette Harvester to download large quantities of digitised newspaper articles from Trove. There’s also a few examples of how you can analyse and explore the harvested data.</p> <p>The notebooks include:</p> <ul> <li><strong>Using TroveHarvester to get newspaper articles in bulk</strong> — an easy introduction to the TroveHarvester tool</li> <li><strong>Trove Harvester web app</strong> — a simple web interface to the TroveHarvester, the easiest way to harvest data from Trove (runs in Voila)</li> <li><strong>Harvesting articles that mention “Anzac Day” on Anzac Day</strong> – import the Harvester as a Python library to harvest a complex search</li> <li><strong>Display the results of a harvest as a searchable database using Datasette</strong> – load your harvested data into a SQLite database and explore it using Datasette</li> <li><strong>Exploring your TroveHarvester data</strong> — use Pandas to analyse your data and create some visualisations</li> <li><strong>Explore harvested text files</strong> (experimental) — analyse the full text content of harvested articles</li> </ul> <p>See the <a href=\"https://glam-workbench.github.io/trove-harvester/\">GLAM Workbench for more details</a>.</p> <h2 id=\"cite-as\">Cite as</h2> <p>See the GLAM Workbench or <a href=\"https://doi.org/10.5281/zenodo.3545044\">Zenodo</a> for up-to-date citation details.</p> <hr /> <p>This repository is part of the <a href=\"https://glam-workbench.github.io/\">GLAM Workbench</a>.<br /> If you think this project is worthwhile, you might like <a href=\"https://github.com/sponsors/wragge?o=esb\">to sponsor me on GitHub</a>.</p>"
}
README.md: 4 changes (2 additions, 2 deletions)
@@ -1,6 +1,6 @@
# Trove Newspaper and Gazette Harvester

- Current version: [v1.3.1](https://github.com/GLAM-Workbench/trove-newspaper-harvester/releases/tag/v1.3.1)
+ Current version: [v2.0.0](https://github.com/GLAM-Workbench/trove-newspaper-harvester/releases/tag/v2.0.0)

The [Trove Newspaper & Gazette Harvester](https://pypi.org/project/trove-newspaper-harvester/) makes it easy to download large quantities of digitised articles from Trove's newspapers and gazettes. Just give it a search from the Trove web interface, and the harvester will save the metadata of all the articles in a CSV (spreadsheet) file for further analysis. You can also save the full text of every article, as well as copies of the articles as JPG images, and even PDFs. While the web interface will only show you the first 2,000 results matching your search, the Newspaper & Gazette Harvester will get **everything**.

@@ -70,7 +70,7 @@ You can use Docker to run a pre-built computing environment on your own computer
* Create a new directory for this repository and open it from the command line.
* From the command line, run the following command:
```
- docker run -p 8888:8888 --name trove-newspaper-harvester -v "$PWD":/home/jovyan/work quay.io/glamworkbench/trove-newspaper-harvester repo2docker-entrypoint jupyter lab --ip 0.0.0.0 --NotebookApp.token='' --LabApp.default_url='/lab/tree/index.ipynb'
+ docker run -p 8888:8888 --name trove-newspaper-harvester quay.io/glamworkbench/trove-newspaper-harvester repo2docker-entrypoint jupyter lab --ip 0.0.0.0 --ServerApp.token=''
```
* It will take a while to download and configure the Docker image. Once it's ready you'll see a message saying that Jupyter Notebook is running.
* Point your web browser to `http://127.0.0.1:8888`
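The README above outlines the basic workflow: feed the harvester a search from the Trove web interface, and it saves article metadata (and optionally full text and images). For orientation, here is a minimal sketch of that workflow driving the trove-newspaper-harvester package as a Python library, as the "Anzac Day" notebook does. The import path and the prepare_query/Harvester signatures are assumptions from memory, not taken from this commit, so check the package documentation before relying on them:

```python
# Sketch only: names and parameters below are assumptions, not confirmed by this commit.
from trove_newspaper_harvester.core import Harvester, prepare_query

# Convert a Trove web-interface search url into API query parameters (assumed helper).
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge"
)

harvester = Harvester(
    query_params=query_params,
    key="YOUR_TROVE_API_KEY",  # your own Trove API key
    text=True,                 # save the full text of each article as well as metadata
)
harvester.harvest()  # results are saved to a timestamped data directory
```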
newspaper_harvester_app.ipynb: 4 changes (0 additions, 4 deletions)
@@ -289,10 +289,6 @@
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"voila": {
"file_allowlist": ".*\\.zip",
"template": "material"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {
scripts/extract_metadata.py: 68 changes (68 additions, 0 deletions)
@@ -0,0 +1,68 @@
import json
from pathlib import Path
from typing import Any, Dict, List

import nbformat

AuthorInfo = Dict[str, str]

DEFAULT_AUTHOR = {
    "name": "Unknown",
    "orcid": "https://orcid.org/0000-0000-0000-0000",
}
CREATORS_KEY = "creators"

# Metadata keys whose values should always be wrapped in a list.
LISTIFY = ["author", "object", "input"]


def extract_metadata(metadata):
    """Load and return the contents of a JSON metadata file."""
    with open(metadata) as file:
        data = json.load(file)
    return data


def extract_default_authors(metadata: Path) -> List[AuthorInfo]:
    """Attempts to extract author information from the metadata.json file within
    the repository. If none are found, returns a dummy value.

    Parameters:
        metadata: The path to the metadata file, commonly metadata.json
    """
    with open(metadata) as file:
        data = json.load(file)

    return data.get(CREATORS_KEY, [DEFAULT_AUTHOR])


def listify(value):
    """Wrap a single value in a list; leave lists unchanged."""
    if not isinstance(value, list):
        return [value]
    return value


def extract_notebook_metadata(notebook: Path, keys: Dict[str, Any]) -> Dict[str, Any]:
    """Attempts to extract metadata from the notebook.

    Parameters:
        notebook: The path to the jupyter notebook
        keys: A dictionary of keys to look for in the notebook, and their
            corresponding defaults if the key is not found.

    Returns:
        A dictionary containing the retrieved metadata for each key.
    """
    result = {}
    nb = nbformat.read(notebook, nbformat.NO_CONVERT)
    # Notebook metadata for these repositories lives in an "rocrate" section.
    metadata = nb.metadata.rocrate
    for key, default in keys.items():
        if key in LISTIFY:
            result[key] = listify(metadata.get(key, default))
        else:
            result[key] = metadata.get(key, default)
    return result

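For context, a sketch of how these helpers might be called. The notebook name, key names, and defaults here are hypothetical, chosen only to illustrate the signatures defined above, and extract_notebook_metadata assumes the notebook actually carries an rocrate metadata block:

```python
from pathlib import Path

# Hypothetical keys and defaults, for illustration only.
nb_metadata = extract_notebook_metadata(
    Path("newspaper_harvester_app.ipynb"),
    {"name": None, "description": None, "author": []},
)
print(nb_metadata["author"])  # "author" is in LISTIFY, so this is always a list

authors = extract_default_authors(Path("metadata.json"))
```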
scripts/list_imports.py: 41 changes (41 additions, 0 deletions)
@@ -0,0 +1,41 @@
from pathlib import Path
import json
import re
import importlib.util
import os.path
import sys

# Packages that are required to run the repository but never imported
# directly in the notebooks.
external_imports = [
    'jupyterlab',
    'voila',
    'voila-material @ git+https://github.com/GLAM-Workbench/voila-material.git',
]

# Path of the environment's lib directory, derived from the Python executable;
# used below to recognise installed (non-stdlib) modules.
python_path = os.path.dirname(sys.executable).replace('bin', 'lib')

# Collect the top-level module names imported by the notebooks.
imports = []
for nb in Path(__file__).resolve().parent.parent.glob('*.ipynb'):
    if not nb.name.startswith('.') and not nb.name.startswith('Untitled'):
        nb_json = json.loads(nb.read_bytes())
        for cell in nb_json['cells']:
            for line in cell['source']:
                if match := re.search(r'^\s*import ([a-zA-Z_]+)(?! from)', line):
                    imports.append(match.group(1))
                elif match := re.search(r'^\s*from ([a-zA-Z_]+)\.?[a-zA-Z_]* import [a-zA-Z_]+', line):
                    imports.append(match.group(1))

# Keep only modules that resolve to installed packages, skipping the stdlib.
for imported_mod in list(set(imports)):
    try:
        module_path = importlib.util.find_spec(imported_mod).origin
    except AttributeError:
        # find_spec() returned None -- the module couldn't be located.
        pass
    else:
        if module_path:
            if 'site-packages' in module_path or python_path in module_path:
                external_imports.append(imported_mod)

# Write the candidate requirements out for manual checking.
with Path(Path(__file__).resolve().parent.parent, 'requirements-tocheck.in').open('w') as req_file:
    for mod in external_imports:
        req_file.write(mod + '\n')
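
As a quick sanity check of the two regular expressions above, here is an illustrative snippet (not part of the script) showing what each pattern captures:

```python
import re

samples = [
    'import pandas as pd',                                   # captures 'pandas'
    '    import json',                                       # captures 'json'
    'from trove_newspaper_harvester.core import Harvester',  # captures 'trove_newspaper_harvester'
]
for line in samples:
    if m := re.search(r'^\s*import ([a-zA-Z_]+)(?! from)', line):
        print(m.group(1))
    elif m := re.search(r'^\s*from ([a-zA-Z_]+)\.?[a-zA-Z_]* import [a-zA-Z_]+', line):
        print(m.group(1))
```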
File renamed without changes.
