Skip to content

Commit

Permalink
Version 3.21rc0
Browse files Browse the repository at this point in the history
  • Loading branch information
mborsetti committed Apr 15, 2024
1 parent 2beeabd commit 90db268
Show file tree
Hide file tree
Showing 13 changed files with 290 additions and 127 deletions.
169 changes: 85 additions & 84 deletions .github/workflows/ci-cd.yaml
Expand Up @@ -225,94 +225,95 @@ jobs:
COVERALLS_FLAG_NAME: tests-${{ matrix.python-version }}-${{ matrix.os }}
COVERALLS_PARALLEL: true

# NOTES FROM 02-09-2023:
# NOTES FROM 15-Apr-2024:
# No need to install poppler, tesseract etc. (tests will skip)
# However, pytest simply crashes during test collection (Error: Process completed with exit code 1), so this does not work.

test_windows:
# No redis (only works on ubuntu), only run on latest python-version
name: "Test ${{ matrix.python-version }} on ${{ matrix.os }}"
runs-on: ${{ matrix.os }}
# Identifies any jobs that must complete successfully before this job will run
# needs: [pre-commit]
# A strategy creates a build matrix for your jobs. You can define different variations to run each job in
strategy:
matrix:
# Python versions at https://github.com/actions/python-versions/releases
python-version: ['3.12']
os: [windows-latest]

env:
commitmsg: ${{ github.event.head_commit.message }} # only available at check-out; becomes env.commitmsg
TELEGRAM_TOKEN: ${{ secrets.telegram_token }} # for telegram testing
REDIS_URI: redis://localhost:6379

steps:
# Checks out a copy of the repository per https://github.com/actions/checkout
- name: Check out repo
uses: actions/checkout@main

# Build Python and packages per https://github.com/actions/setup-python
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@main
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
cache-dependency-path: |
requirements.txt
tests/requirements_pytest.txt
- name: Upgrade pip etc.
run: |
pip install --upgrade pip setuptools wheel
# ISSUE WITH THE BELOW:
# Hangs at start /wait "" Miniconda3-latest-Windows-x86_64.exe /RegisterPython=0 /D=%temp%\Miniconda3
# - name: Install pdf2text and ocr dependencies (Python 3.11)
# # do full install and testing of pdf2text and ocr only on latest Python version
# if: matrix.python-version == '3.11'
# Was doing the same on 02-Sep-2023

# test_windows:
# # No redis (only works on ubuntu), only run on latest python-version
# name: "Test ${{ matrix.python-version }} on ${{ matrix.os }}"
# runs-on: ${{ matrix.os }}
# # Identifies any jobs that must complete successfully before this job will run
# # needs: [pre-commit]
# # A strategy creates a build matrix for your jobs. You can define different variations to run each job in
# strategy:
# matrix:
# # Python versions at https://github.com/actions/python-versions/releases
# python-version: ['3.12']
# os: [windows-latest]
#
# env:
# commitmsg: ${{ github.event.head_commit.message }} # only available at check-out; becomes env.commitmsg
# TELEGRAM_TOKEN: ${{ secrets.telegram_token }} # for telegram testing
# REDIS_URI: redis://localhost:6379
#
# steps:
# # Checks out a copy of the repository per https://github.com/actions/checkout
# - name: Check out repo
# uses: actions/checkout@main
#
# # Build Python and packages per https://github.com/actions/setup-python
# - name: Set up Python ${{ matrix.python-version }}
# uses: actions/setup-python@main
# with:
# python-version: ${{ matrix.python-version }}
# cache: 'pip'
# cache-dependency-path: |
# requirements.txt
# tests/requirements_pytest.txt
#
# - name: Upgrade pip etc.
# run: |
# cd %temp%
# dir
# rem # install conda per https://conda.io/projects/conda/en/stable/user-guide/install/windows.html
# curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -O
# powershell Get-FileHash Miniconda3-latest-Windows-x86_64.exe -Algorithm SHA256
# echo hash should match the one at https://docs.conda.io/projects/miniconda/en/latest/
# rem # below, /S = silent mode
# start /wait "" Miniconda3-latest-Windows-x86_64.exe /RegisterPython=0 /D=%temp%\Miniconda3
# del Miniconda3-latest-Windows-x86_64.exe /f
# call "%temp%\Miniconda3\conda" install -y -c conda-forge poppler
# rem # *** get latest tesseract filename from https://digi.bib.uni-mannheim.de/tesseract/?C=M;O=D
# set TESSERACT=tesseract-ocr-w64-setup-5.3.1.20230401.exe
# curl https://digi.bib.uni-mannheim.de/tesseract/%TESSERACT% -O
# start /wait "" %TESSERACT% /S
# del %TESSERACT% /f
# pip install --upgrade keyring pdftotext Pillow pytesseract
# shell: cmd

- name: Install all other dependencies
# if: matrix.python-version <= '3.11'
run: |
pip install --upgrade coveralls -r requirements.txt -r tests/requirements_pytest.txt
# - name: Install all other dependencies (Python 3.12)
# if: matrix.python-version > '3.11'
# pip install --upgrade pip setuptools wheel
#
## ISSUE WITH THE BELOW:
## Hangs at start /wait "" Miniconda3-latest-Windows-x86_64.exe /RegisterPython=0 /D=%temp%\Miniconda3
## - name: Install pdf2text and ocr dependencies (Python 3.11)
## # do full install and testing of pdf2text and ocr only on latest Python version
## if: matrix.python-version == '3.11'
## run: |
## cd %temp%
## dir
## rem # install conda per https://conda.io/projects/conda/en/stable/user-guide/install/windows.html
## curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -O
## powershell Get-FileHash Miniconda3-latest-Windows-x86_64.exe -Algorithm SHA256
## echo hash should match the one at https://docs.conda.io/projects/miniconda/en/latest/
## rem # below, /S = silent mode
## start /wait "" Miniconda3-latest-Windows-x86_64.exe /RegisterPython=0 /D=%temp%\Miniconda3
## del Miniconda3-latest-Windows-x86_64.exe /f
## call "%temp%\Miniconda3\conda" install -y -c conda-forge poppler
## rem # *** get latest tesseract filename from https://digi.bib.uni-mannheim.de/tesseract/?C=M;O=D
## set TESSERACT=tesseract-ocr-w64-setup-5.3.1.20230401.exe
## curl https://digi.bib.uni-mannheim.de/tesseract/%TESSERACT% -O
## start /wait "" %TESSERACT% /S
## del %TESSERACT% /f
## pip install --upgrade keyring pdftotext Pillow pytesseract
## shell: cmd
#
# - name: Install all other dependencies
## if: matrix.python-version <= '3.11'
# run: |
# pip install --upgrade greenlet==3.0.0a1 coveralls -r requirements.txt -r tests/requirements_pytest.txt

- name: Run tests
# workaround for Windows fatal exception: access violation
# python -m required to get it to run in the correct directory; '>' folded style scalar (allows splitting line)
run: >
python -m pytest -v --cov=./ --cov-report=term --cov-report=xml --cov-config=.coveragerc tests/ -p
no:faulthandler
- name: Upload coverage data to coveralls.io (parallel)
run: coveralls --service=github
env:
GITHUB_TOKEN: ${{ secrets.github_token }}
COVERALLS_FLAG_NAME: tests-${{ matrix.python-version }}-${{ matrix.os }}
COVERALLS_PARALLEL: true
# pip install --upgrade coveralls -r requirements.txt -r tests/requirements_pytest.txt
#
## - name: Install all other dependencies (Python 3.12)
## if: matrix.python-version > '3.11'
## run: |
## pip install --upgrade greenlet==3.0.0a1 coveralls -r requirements.txt -r tests/requirements_pytest.txt
#
# - name: Run tests
# # workaround for Windows fatal exception: access violation
# # python -m required to get it to run in the correct directory; '>' folded style scalar (allows splitting line)
# run: >
# python -m pytest -v --cov=./ --cov-report=term --cov-report=xml --cov-config=.coveragerc tests/ -p
# no:faulthandler
#
# - name: Upload coverage data to coveralls.io (parallel)
# run: coveralls --service=github
# env:
# GITHUB_TOKEN: ${{ secrets.github_token }}
# COVERALLS_FLAG_NAME: tests-${{ matrix.python-version }}-${{ matrix.os }}
# COVERALLS_PARALLEL: true

coveralls:
name: Completion -> coveralls.io
Expand Down
48 changes: 30 additions & 18 deletions README.rst
Expand Up @@ -9,10 +9,12 @@
webchanges |downloads|
======================

**webchanges** checks web content and notifies you via email (or one of many other `supported services
<https://webchanges.readthedocs.io/en/stable/introduction.html#reporters-list>`__) if a change is detected.
**webchanges** can also check the output of local commands. The notification includes the changed URL or
command and a summary (diff) of what has changed.
**webchanges** checks web content, including images, and notifies you via email (or one of many other `supported
services <https://webchanges.readthedocs.io/en/stable/introduction.html#reporters-list>`__) if a change is detected.
**webchanges** can also check the output of local commands.

The notification includes the changed URL or command and a summary (diff) of what has changed, with an optional
summary generated by AI (BETA).

**webchanges** *anonymously* alerts you of web changes.

Expand All @@ -26,6 +28,10 @@ You should use the latest version of `Python <https://www.python.org/downloads/>
Python versions are supported for 3 years after being obsoleted by a new major release (3.x). For each major release,
only the latest bug and security fix version (3.x.y) is supported.

To use Generative AI summaries (BETA) you need a free `API Key from Google Cloud AI Studio
<https://aistudio.google.com/app/apikey>`__ (see `here
<https://webchanges.readthedocs.io/en/stable/differs.html#ai-google>`__).


Installation
============
Expand Down Expand Up @@ -117,26 +123,32 @@ licensed under a `BSD 3-Clause License
license `here <https://github.com/mborsetti/webchanges/blob/main/LICENSE>`__.


Compatibility with **urlwatch**
================================
Compatibility with and improvements from **urlwatch**
=====================================================

This project is based on code from `urlwatch 2.21
<https://github.com/thp/urlwatch/tree/346b25914b0418342ffe2fb0529bed702fddc01f>`__ dated 30 July 2020. You can
easily upgrade from the current version of **urlwatch** (see `here
<https://webchanges.readthedocs.io/en/stable/migration.html>`__) using the same job and configuration files and
benefit from many HTML-focused improvements, including:

* Report links that are `clickable <https://pypi.org/project/webchanges/>`__!
* Original formatting such as **bolding / headers**, *italics*, :underline:`underlining`, list bullets (•) and
indentation;
* :additions:`Added` and :deletions:`deleted` lines clearly highlighted by color and strikethrough, and long lines that
wrap around;
* Correct rendering by email clients who override stylesheets (e.g. Gmail);
* Other legibility improvements;
easily upgrade to **webchanges** from the current version of **urlwatch** using the same job and configuration files
(see `here <https://webchanges.readthedocs.io/en/stable/migration.html>`__) and benefit from many improvements,
including:

* Summary of changes in plain text using Generative AI, useful for long, boring, legal documents;
* Depicting changes to an image;
* Element-by-element changes of JSON or XML data;
* Much better `documentation <https://webchanges.readthedocs.io/>`__;
* Many improvements to HTML reports, including:

* Links that are `clickable <https://pypi.org/project/webchanges/>`__!
* Retention of original formatting such as **bolding / headers**, *italics*, :underline:`underlining`, list bullets
(•) and indentation;
* :additions:`Added` and :deletions:`deleted` lines clearly highlighted by color and strikethrough, and long lines
that wrap around;
* Correct rendering by email clients that override stylesheets (e.g. Gmail);
* Other legibility improvements;

* New filters such as `additions_only <https://webchanges.readthedocs.io/en/stable/diff_filters.html#additions-only>`__,
which makes it easier to track content that was added without the distractions of the content that was deleted;
* New features such as ``--errors`` to catch jobs that no longer work;
* Much better `documentation <https://webchanges.readthedocs.io/>`__;
* More reliability and stability, including a ~30 percentage point increase in testing coverage;
* Many other additions, refinements and fixes (see `detailed information
<https://webchanges.readthedocs.io/en/stable/migration.html#upgrade-details>`__).
Expand Down
5 changes: 3 additions & 2 deletions docs/cli_help.txt
Expand Up @@ -14,8 +14,9 @@ usage: webchanges [-h] [-V] [-v] [--jobs FILE] [--config FILE] [--hooks FILE]
[--max-snapshots NUM_SNAPSHOTS] [--add JOB] [--delete JOB]
[JOB(S) ...]

Checks web content to detect any changes since the prior run. If any are found,
it shows what changed ('diff') and/or sends it via email and/or other supported
Checks web content, including images, to detect any changes since the prior
run. If any are found, it summarizes (including with Gen AI) what changed
('diff') and displays it and/or sends it via email and/or other supported
services. Can check the output of local commands as well.

positional arguments:
Expand Down
14 changes: 10 additions & 4 deletions docs/differs.rst
Expand Up @@ -367,11 +367,17 @@ Optional directives
```````````````````
This differ is currently in BETA and the directives may change in the future.

* ``data_type`` (``url``, ``filename``, or ``base_64``): What the data represent: a link to the image, the path to the
file containing the image or the image itself as `Base_64 <https://en.wikipedia.org/wiki/Base64>`__ (default:
``url``).
* ``data_type`` (``url``, ``filename``, ``ascii85`` or ``base64``): What the data represent: a link to the image, the
path to the file containing the image or the image itself as `Ascii85 <https://en.wikipedia.org/wiki/Ascii85>`__ or
`RFC 4648 <https://datatracker.ietf.org/doc/html/rfc4648.html>`__ `Base64 <https://en.wikipedia.org/wiki/Base64>`__
text (default: ``url``).
* ``mse_threshold`` (float): The minimum mean squared error (MSE) between two images to consider them changed;
requires the package ``numppy`` to be installed (default: 2.5).
requires the package ``numpy`` to be installed (default: 2.5).

.. note:: If you pass a ``url`` or ``filename`` to the differ, it will detect changes only if the url or
filename changes, not if the image behind the url/filename does. To detect changes in an image when the url or
filename doesn't change, build a job that captures the image itself encoded in Ascii85 or Base64 (potentially using
the :ref:`ascii85` filter) and set ``data_type: ascii85`` or ``data_type: base64``.

Required packages
`````````````````
Expand Down
43 changes: 43 additions & 0 deletions docs/filters.rst
Expand Up @@ -68,6 +68,10 @@ At the moment, the following filters are available:
- :ref:`pypdf`: Convert PDF to plaintext.
- :ref:`pdf2text`: Convert PDF to plaintext (Poppler required as an external dependency).

* To save images:

- :ref:`ascii85`: Convert binary data such as images to text (for downstream differ :ref:`image_diff`).

* To extract text from images:

- :ref:`ocr`: Extract text from images.
Expand Down Expand Up @@ -145,6 +149,45 @@ attribute of the ``<object>`` tag, to absolute ones.
of the ``<object>`` tag.



.. _ascii85:

ascii85
-------
Encodes binary data (e.g. image data) to text using `Ascii85 <https://en.wikipedia.org/wiki/Ascii85>`__. Ascii85 is
more space-efficient than Base64, encoding more bytes into fewer characters. This filter can be useful to monitor
images in combination with the :ref:`image_diff` differ.

.. code-block:: yaml
url: https://example.net/favicon_85.ico
filter:
- ascii85
.. versionadded:: 3.21


..
.. _base64:
base64
------
Encodes binary data (e.g. image data) to text using `RFC 4648 <https://datatracker.ietf.org/doc/html/rfc4648.html>`__
`Base64 <https://en.wikipedia.org/wiki/Base64>`__. This filter can be useful to monitor images in combination with
the :ref:`image_diff` differ. Also see :ref:`ascii85`, which is more efficient.

.. code-block:: yaml
url: https://example.net/favicon.ico
filter:
- base64
.. versionadded:: 3.16



.. _beautify:

beautify
Expand Down
5 changes: 3 additions & 2 deletions docs/make_html.bat
@@ -1,9 +1,10 @@
@ECHO OFF

REM Command file to generate Sphinx documentation in Windows

setlocal EnableDelayedExpansion
pushd %~dp0

REM Command file to generate Sphinx documentation

if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Expand Up @@ -15,7 +15,8 @@ build-backend = "setuptools.build_meta"
dynamic = ['version', 'dependencies']
name = 'webchanges'
description = """\
Check web (or command output) for changes since last run and notify. Anonymously alerts you of web changes.\
Check web (or command output) for changes since last run and notify. Anonymously alerts you of web changes, with
Gen AI summaries (BETA).\
"""
readme = { file = 'README.rst', content-type = 'text/x-rst' }
requires-python = '>=3.9'
Expand Down
8 changes: 7 additions & 1 deletion tests/data/docs_filters_testdata.yaml
Expand Up @@ -294,6 +294,12 @@ https://example.com/html2text_strip_tags.html:
</body>
</html>
output: " Date#Sales™\n Monday, 3 February 202010,000\n Tu, 3 Mar20,000"
https://example.net/favicon_85.ico:
input: # in code, since it's binary
output: a
https://example.net/favicon.ico:
input: # in code, since it's binary
output: a
https://example.net/beautify.html:
input: |
<!DOCTYPE html>
Expand Down Expand Up @@ -659,7 +665,7 @@ https://example.net/execute.html:
The data is 'TEST'
The job location is 'https://example.net/execute.html'
The job name is 'Test execute filter'
The job number is '21'
The job number is '23'
The job JSON is '{"filter": [{"execute": "python -c \"import os, sys; print(f\\\"The data is '{sys.stdin.read()}'\\nThe job location is '{os.getenv('WEBCHANGES_JOB_LOCATION')}'\\nThe job name is '{os.getenv('WEBCHANGES_JOB_NAME')}'\\nThe job number is '{os.getenv('WEBCHANGES_JOB_INDEX_NUMBER')}'\\nThe job JSON is '{os.getenv('WEBCHANGES_JOB_JSON')}'\\\", end='')\""}], "index_number": 58, "name": "Test execute filter", "url": "https://example.net/execute.html"}'
https://example.net/shellpipe.html:
input: |
Expand Down

0 comments on commit 90db268

Please sign in to comment.