Add tags to jobs and a command line arg to select them (#789)
This allows the user to selectively run jobs based on a tag, for use
cases where you might want to run urlwatch on different schedules for
different jobs.

---------

Signed-off-by: James Hewitt <james.hewitt@uk.ibm.com>
Co-authored-by: Thomas Perl <m@thp.io>
Jamstah and thp committed Apr 24, 2024
1 parent 342dd6a commit 1b045b6
Showing 11 changed files with 188 additions and 16 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -11,6 +11,7 @@ The format mostly follows [Keep a Changelog](http://keepachangelog.com/en/1.0.0/
- New `enabled` option for all jobs. Set to false to disable a job without needing to remove it or comment it out (Requested in #625 by snowman, contributed in #785 by jamstah)
- New option `ignore_incomplete_reads` (Requested in #725 by wschoot, contributed in #787 by wfrisch)
- New option `wait_for` in browser jobs (Requested in #763 by yuis-ice, contributed in #810 by jamstah)
+- Added tags to jobs and the ability to select them at the command line (#789 by jamstah)

### Changed

1 change: 1 addition & 0 deletions docs/source/jobs.rst
@@ -172,6 +172,7 @@ Optional keys for all job types
-------------------------------

- ``name``: Human-readable name/label of the job
+- ``tags``: Array of tags
- ``filter``: :doc:`filters` (if any) to apply to the output (can be tested with ``--test-filter``)
- ``max_tries``: After this many sequential failed runs, the error will be reported rather than ignored
- ``diff_tool``: Command to a custom tool for generating diff text
8 changes: 6 additions & 2 deletions docs/source/manpage.rst
@@ -23,13 +23,17 @@ This manpage describes the CLI tool.

positional arguments:
  JOB
-    index of job(s) to run, as numbered according to the --list command.
-    If none are specified, then all jobs will be run.
+    indexes or tags of job(s) to run.
+    If --tags is set, each JOB is a tag,
+    if not, each JOB is an index numbered according to the --list command.

optional arguments:
  -h, --help
    show this help message and exit

+  --tags
+    use tags instead of indexes to select jobs to run
+
  --version
    show program's version number and exit

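For example, with a jobs file like the test fixture added below (tag names here are illustrative), selection works as follows:

    urlwatch                     # run all enabled jobs
    urlwatch 1 3                 # run jobs 1 and 3, as numbered by --list
    urlwatch --tags utc local    # run every job tagged "utc" or "local"
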
8 changes: 4 additions & 4 deletions lib/urlwatch/command.py
@@ -95,16 +95,16 @@ def show_features(self):
        return 0

    def list_urls(self):
-        for idx, job in enumerate(self.urlwatcher.jobs):
+        for idx, job in enumerate(self.urlwatcher.jobs, 1):
            if self.urlwatch_config.verbose:
-                print('%d: %s' % (idx + 1, repr(job)))
+                print('%d: %s' % (idx, repr(job)))
            else:
                pretty_name = job.pretty_name()
                location = job.get_location()
                if pretty_name != location:
-                    print('%d: %s ( %s )' % (idx + 1, pretty_name, location))
+                    print('%d: %s ( %s )' % (idx, pretty_name, location))
                else:
-                    print('%d: %s' % (idx + 1, pretty_name))
+                    print('%d: %s' % (idx, pretty_name))
        return 0

    def _find_job(self, query):
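The change above switches to enumerate's start argument instead of adding 1 to every index; a minimal standalone illustration (the job list contents are made up):

jobs = ['https://example.com', 'date -u']
# enumerate(..., 1) yields 1-based indices directly, replacing "idx + 1".
for idx, job in enumerate(jobs, 1):
    print('%d: %s' % (idx, job))
# 1: https://example.com
# 2: date -u
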
19 changes: 16 additions & 3 deletions lib/urlwatch/config.py
@@ -64,12 +64,13 @@ def __init__(self, args, pkgname, urlwatch_dir, prefix, config, urls, hooks, cac
        self.parse_args(args)

    def parse_args(self, cmdline_args):

        parser = argparse.ArgumentParser(description=urlwatch.__doc__,
                                         formatter_class=argparse.RawDescriptionHelpFormatter)
-        parser.add_argument('joblist', metavar='JOB', type=int, nargs="*", help='index of job(s) to run, as numbered according to the --list command. If none specified, then all jobs will be run.')
+        parser.add_argument('joblist', metavar='JOB', type=str, nargs="*", help='indexes or tags of job(s) to run, depending on --tags. If using indexes, they are as numbered according to the --list command. If none are specified, then all jobs will be run.')
+        parser.add_argument('--tags', action='store_true', help='Use tags instead of indexes to select jobs to run')
        parser.add_argument('--version', action='version', version='%(prog)s {}'.format(urlwatch.__version__))
        parser.add_argument('-v', '--verbose', action='store_true', help='show debug output')

        group = parser.add_argument_group('files and directories')
        group.add_argument('--urls', metavar='FILE', help='read job list (URLs) from FILE',
                           default=self.urls)
@@ -95,17 +96,29 @@ def parse_args(self, cmdline_args):
        group.add_argument('--test-diff-filter', metavar='JOB',
                           help='test diff filter output of job by location or index (needs at least 2 snapshots)')
        group.add_argument('--dump-history', metavar='JOB', help='dump historical cached data for a job')

        group = parser.add_argument_group('interactive commands ($EDITOR/$VISUAL)')
        group.add_argument('--edit', action='store_true', help='edit URL/job list')
        group.add_argument('--edit-config', action='store_true', help='edit configuration file')
        group.add_argument('--edit-hooks', action='store_true', help='edit hooks script')

        group = parser.add_argument_group('miscellaneous')
        group.add_argument('--features', action='store_true', help='list supported jobs/filters/reporters')
        group.add_argument('--gc-cache', metavar='RETAIN_LIMIT', type=int, help='remove old cache entries, keeping the latest RETAIN_LIMIT (default: 1)',
                           nargs='?', const=1)

        args = parser.parse_args(cmdline_args)

-        for i, arg in enumerate(vars(args)):
+        if args.tags:
+            if not args.joblist:
+                raise SystemExit("No tags specified")
+            self.tag_set = frozenset(args.joblist)
+        else:
+            try:
+                self.idx_set = frozenset(int(s) for s in args.joblist)
+            except ValueError as e:
+                parser.error(e)
+
+        for arg in vars(args):
            argval = getattr(args, arg)
            setattr(self, arg, argval)
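A minimal sketch of how the new arguments resolve to selector sets, reproducing the logic above outside the class (the two-argument parser here is a stripped-down stand-in, not the full urlwatch parser):

import argparse

# Stripped-down stand-in for the parser built in parse_args() above.
parser = argparse.ArgumentParser()
parser.add_argument('joblist', metavar='JOB', type=str, nargs='*')
parser.add_argument('--tags', action='store_true')

args = parser.parse_args(['--tags', 'utc', 'local'])
if args.tags:
    if not args.joblist:
        raise SystemExit('No tags specified')
    tag_set = frozenset(args.joblist)
    print(tag_set)  # frozenset({'utc', 'local'})
else:
    # In index mode, non-integer JOB arguments are rejected here.
    idx_set = frozenset(int(s) for s in args.joblist)
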
17 changes: 16 additions & 1 deletion lib/urlwatch/jobs.py
@@ -35,6 +35,7 @@
import re
import subprocess
import textwrap
+from typing import Iterable, Optional, Set, FrozenSet

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
@@ -196,7 +197,10 @@ def ignore_error(self, exception):

class Job(JobBase):
    __required__ = ()
-    __optional__ = ('name', 'filter', 'max_tries', 'diff_tool', 'compared_versions', 'diff_filter', 'enabled', 'treat_new_as_changed', 'user_visible_url')
+    __optional__ = ('name', 'filter', 'max_tries', 'diff_tool', 'compared_versions', 'diff_filter', 'enabled', 'treat_new_as_changed', 'user_visible_url', 'tags')
+
+    def matching_tags(self, tags: Set[str]) -> Set[str]:
+        return self.tags & tags

    # determine if hyperlink "a" tag is used in HtmlReporter
    def location_is_url(self):

@@ -208,6 +212,17 @@ def pretty_name(self):
    def is_enabled(self):
        return self.enabled is None or self.enabled

+    @property
+    def tags(self) -> Optional[FrozenSet[str]]:
+        return self._tags
+
+    @tags.setter
+    def tags(self, value: Optional[Iterable[str]]):
+        if value is None:
+            self._tags = None
+        else:
+            self._tags = frozenset(value)


class ShellJob(Job):
"""Run a shell command and get its standard output"""
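Since the setter normalizes tags to a frozenset, matching_tags() reduces to a set intersection; a quick standalone check of those semantics (tag values are illustrative):

# Tags are stored as a frozenset, so matching is a set intersection:
# a non-empty result is truthy and means at least one requested tag matches.
job_tags = frozenset(['arg', 'utc'])   # what the setter above would store
print(job_tags & {'utc', 'local'})     # frozenset({'utc'}) -> job selected
print(job_tags & {'rfc'})              # frozenset() -> job not selected
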
15 changes: 15 additions & 0 deletions lib/urlwatch/main.py
@@ -68,6 +68,21 @@ def __init__(self, urlwatch_config, config_storage, cache_storage, urls_storage)
        if hasattr(self.urlwatch_config, 'migrate_urls'):
            self.urlwatch_config.migrate_cache(self)

+    def should_run(self, idx, job):
+        if not job.is_enabled():
+            return False
+
+        # Tag mode and tag(s) were specified
+        if self.urlwatch_config.tags and self.urlwatch_config.tag_set:
+            return job.matching_tags(self.urlwatch_config.tag_set)
+
+        # Index mode and index(es) were specified
+        if not self.urlwatch_config.tags and self.urlwatch_config.idx_set:
+            return idx in self.urlwatch_config.idx_set
+
+        # Either mode, and no jobs were specified
+        return True
+
    def check_directories(self):
        if not os.path.exists(self.urlwatch_config.config):
            self.config_storage.write_default_config(self.urlwatch_config.config)
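A standalone restatement of the three selection rules in should_run(), with plain values standing in for the job and config objects (a sketch, not the production code path):

def should_run_sketch(idx, job_tags, enabled, tags_mode, tag_set, idx_set):
    """Hypothetical restatement of Urlwatch.should_run() above."""
    if not enabled:
        return False                     # disabled jobs never run
    if tags_mode and tag_set:
        return bool(job_tags & tag_set)  # tag mode: any overlapping tag
    if not tags_mode and idx_set:
        return idx in idx_set            # index mode: 1-based index match
    return True                          # no selection given: run everything

assert should_run_sketch(1, {'utc'}, True, True, {'utc'}, frozenset())
assert not should_run_sketch(2, {'rfc'}, True, True, {'utc'}, frozenset())
assert should_run_sketch(3, set(), True, False, frozenset(), frozenset())
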
4 changes: 2 additions & 2 deletions lib/urlwatch/reporters.py
@@ -313,7 +313,7 @@ def submit(self):
        sep = (line_length * '=') or None
        yield from (part for part in itertools.chain(
            (sep,),
-            ('%02d. %s' % (idx + 1, line) for idx, line in enumerate(summary)),
+            ('%02d. %s' % (idx, line) for idx, line in enumerate(summary, 1)),
            (sep, ''),
        ) if part is not None)

@@ -860,7 +860,7 @@ def _render(cls, max_length, summary=None, details=None, footer=None):
        # The footer/summary lengths are the sum of the length of their parts
        # plus the space taken up by newlines.
        if summary:
-            summary = ['%d. %s' % (idx + 1, line) for idx, line in enumerate(summary)]
+            summary = ['%d. %s' % (idx, line) for idx, line in enumerate(summary, 1)]
            summary_len = sum(len(part) for part in summary) + len(summary) - 1
        else:
            summary_len = 0
17 changes: 17 additions & 0 deletions lib/urlwatch/tests/data/jobs-with-tags.yaml
@@ -0,0 +1,17 @@
+---
+name: UTC
+command: date -u
+tags:
+  - arg
+  - utc
+---
+name: RFC
+command: date -R
+tags:
+  - arg
+  - rfc
+---
+name: Local
+command: date
+tags:
+  - local
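Given this fixture, a short sketch of which jobs each selector picks, mirroring the assertions in the tests below:

# Which fixture jobs match which tag request (cf. the tests below).
jobs = {'UTC': {'arg', 'utc'}, 'RFC': {'arg', 'rfc'}, 'Local': {'local'}}

def select(requested):
    return [name for name, tags in jobs.items() if tags & set(requested)]

print(select(['arg']))           # ['UTC', 'RFC']   -> test_tags_single: 2 jobs
print(select(['utc', 'local']))  # ['UTC', 'Local'] -> test_tags_multiple: 2 jobs
print(select(['foo']))           # []               -> test_tags_no_match: 0 jobs
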
108 changes: 106 additions & 2 deletions lib/urlwatch/tests/test_handler.py
@@ -78,8 +78,8 @@ def test_load_hooks_py():


class ConfigForTest(CommandConfig):
-    def __init__(self, config, urls, cache, hooks, verbose):
-        super().__init__([], 'urlwatch', os.path.dirname(__file__), root, config, urls, hooks, cache, verbose)
+    def __init__(self, config, urls, cache, hooks, verbose, args=()):
+        super().__init__(args, 'urlwatch', os.path.dirname(__file__), root, config, urls, hooks, cache, verbose)


@contextlib.contextmanager
@@ -112,6 +112,110 @@ def test_run_watcher():
        cache_storage.close()


+def prepare_tags_test(args):
+    urls = os.path.join(here, 'data', 'jobs-with-tags.yaml')
+    config = os.path.join(here, 'data', 'urlwatch.yaml')
+    cache = os.path.join(here, 'data', 'cache.db')
+    hooks = ''
+
+    config_storage = YamlConfigStorage(config)
+    urls_storage = UrlsYaml(urls)
+    cache_storage = CacheMiniDBStorage(cache)
+
+    urlwatch_config = ConfigForTest(config, urls, cache, hooks, True, args=args)
+    urlwatcher = Urlwatch(urlwatch_config, config_storage, cache_storage, urls_storage)
+
+    return urlwatcher, cache_storage
+
+
+def test_idxs_none():
+    with teardown_func():
+        urlwatcher, cache_storage = prepare_tags_test([])
+        try:
+            urlwatcher.run_jobs()
+
+            assert len(urlwatcher.report.job_states) == 3
+        finally:
+            cache_storage.close()
+
+
+def test_idxs_zero():
+    with teardown_func():
+        urlwatcher, cache_storage = prepare_tags_test(['0'])
+        try:
+            with pytest.raises(ValueError):
+                urlwatcher.run_jobs()
+        finally:
+            cache_storage.close()
+
+
+def test_idxs_massive():
+    with teardown_func():
+        urlwatcher, cache_storage = prepare_tags_test(['99999'])
+        try:
+            with pytest.raises(ValueError):
+                urlwatcher.run_jobs()
+        finally:
+            cache_storage.close()
+
+
+def test_idxs_nan():
+    with teardown_func():
+        with pytest.raises(SystemExit):
+            ConfigForTest('', '', '', '', True, ['NaN'])
+
+
+def test_idxs_one():
+    with teardown_func():
+        urlwatcher, cache_storage = prepare_tags_test(['1'])
+        try:
+            urlwatcher.run_jobs()
+
+            assert len(urlwatcher.report.job_states) == 1
+            assert urlwatcher.report.job_states[0].job.name == "UTC"
+        finally:
+            cache_storage.close()
+
+
+def test_tags_empty():
+    with teardown_func():
+        with pytest.raises(SystemExit):
+            ConfigForTest('', '', '', '', True, ['--tags'])
+
+
+def test_tags_no_match():
+    with teardown_func():
+        urlwatcher, cache_storage = prepare_tags_test(['--tags', 'foo'])
+        try:
+            urlwatcher.run_jobs()
+
+            assert len(urlwatcher.report.job_states) == 0
+        finally:
+            cache_storage.close()
+
+
+def test_tags_single():
+    with teardown_func():
+        urlwatcher, cache_storage = prepare_tags_test(['--tags', 'arg'])
+        try:
+            urlwatcher.run_jobs()
+
+            assert len(urlwatcher.report.job_states) == 2
+        finally:
+            cache_storage.close()
+
+
+def test_tags_multiple():
+    with teardown_func():
+        urlwatcher, cache_storage = prepare_tags_test(['--tags', 'utc', 'local'])
+        try:
+            urlwatcher.run_jobs()
+
+            assert len(urlwatcher.report.job_states) == 2
+        finally:
+            cache_storage.close()


def test_disabled_job():
    with teardown_func():
        urls = os.path.join(here, 'data', 'disabled-job.yaml')
6 changes: 4 additions & 2 deletions lib/urlwatch/worker.py
@@ -51,11 +51,13 @@ def run_parallel(func, items):


def run_jobs(urlwatcher):
-    if not all(1 <= idx <= len(urlwatcher.jobs) for idx in urlwatcher.urlwatch_config.joblist):
+    if not urlwatcher.urlwatch_config.tags and not all(1 <= idx <= len(urlwatcher.jobs) for idx in urlwatcher.urlwatch_config.idx_set):
        raise ValueError(f'All job indices must be between 1 and {len(urlwatcher.jobs)}: {urlwatcher.urlwatch_config.joblist}')
    cache_storage = urlwatcher.cache_storage
    jobs = [job.with_defaults(urlwatcher.config_storage.config)
-            for (idx, job) in enumerate(urlwatcher.jobs) if job.is_enabled() and ((idx + 1) in urlwatcher.urlwatch_config.joblist or (not urlwatcher.urlwatch_config.joblist))]
+            for (idx, job) in enumerate(urlwatcher.jobs, 1)
+            if urlwatcher.should_run(idx, job)
+            ]
    report = urlwatcher.report

    logger.debug('Processing %d jobs (out of %d)', len(jobs), len(urlwatcher.jobs))
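The range check above only applies in index mode; a minimal sketch of the validation, with a made-up job count (cf. test_idxs_zero and test_idxs_massive):

# Index mode: every requested index must fall within 1..len(jobs),
# otherwise run_jobs() raises ValueError before any job executes.
n_jobs = 3
for idx_set in (frozenset({1, 3}), frozenset({0}), frozenset({99999})):
    ok = all(1 <= idx <= n_jobs for idx in idx_set)
    print(sorted(idx_set), 'valid' if ok else 'raises ValueError')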
