Skip to content

Commit

Permalink
Formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
akoumjian committed Dec 27, 2018
1 parent 6602b1f commit 5e24e8d
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 97 deletions.
5 changes: 2 additions & 3 deletions README.rst
Expand Up @@ -21,9 +21,8 @@ datefinder - extract dates from text
A python module for locating dates inside text. Use this package to extract all sorts
of date like strings from a document and turn them into datetime objects.

This module finds the likely datetime strings and then uses the
`dateparser <https://github.com/scrapinghub/dateparser>`_ package to convert
to the datetime object.
This module finds the likely datetime strings and then uses
`dateutil` to convert them to datetime objects.


Installation
Expand Down
55 changes: 32 additions & 23 deletions datefinder/__init__.py
Expand Up @@ -3,9 +3,15 @@
import regex as re
from dateutil import tz, parser

from .constants import REPLACEMENTS, TIMEZONE_REPLACEMENTS, STRIP_CHARS, DATE_REGEX, RANGE_REGEX
from .constants import (
REPLACEMENTS,
TIMEZONE_REPLACEMENTS,
STRIP_CHARS,
DATE_REGEX,
RANGE_REGEX,
)

logger = logging.getLogger('datefinder')
logger = logging.getLogger("datefinder")


class DateFinder(object):
Expand All @@ -18,7 +24,9 @@ def __init__(self, base_date=None):

def find_dates(self, text, source=False, index=False, strict=False):

for date_string, indices, captures in self.extract_date_strings(text, strict=strict):
for date_string, indices, captures in self.extract_date_strings(
text, strict=strict
):

as_dt = self.parse_date_string(date_string, captures)
if as_dt is None:
Expand All @@ -44,8 +52,8 @@ def _find_and_replace(self, date_string, captures):
"""
# add timezones to replace
cloned_replacements = copy.copy(REPLACEMENTS) # don't mutate
for tz_string in captures.get('timezones', []):
cloned_replacements.update({tz_string: ' '})
for tz_string in captures.get("timezones", []):
cloned_replacements.update({tz_string: " "})

date_string = date_string.lower()
for key, replacement in cloned_replacements.items():
Expand All @@ -55,9 +63,14 @@ def _find_and_replace(self, date_string, captures):
# 2. match ' to'
# 3. match ' to '
# but never match r'(\s|)to(\s|)' which would make 'october' > 'ocber'
date_string = re.sub(r'(^|\s)' + key + r'(\s|$)', replacement, date_string, flags=re.IGNORECASE)
date_string = re.sub(
r"(^|\s)" + key + r"(\s|$)",
replacement,
date_string,
flags=re.IGNORECASE,
)

return date_string, self._pop_tz_string(sorted(captures.get('timezones', [])))
return date_string, self._pop_tz_string(sorted(captures.get("timezones", [])))

def _pop_tz_string(self, list_of_timezones):
try:
Expand All @@ -66,7 +79,7 @@ def _pop_tz_string(self, list_of_timezones):
# want replaced with better abbreviation
return TIMEZONE_REPLACEMENTS.get(tz_string, tz_string)
except IndexError:
return ''
return ""

def _add_tzinfo(self, datetime_obj, tz_string):
"""
Expand Down Expand Up @@ -98,7 +111,7 @@ def parse_date_string(self, date_string, captures):
return None

try:
logger.debug('Parsing {0} with dateutil'.format(date_string))
logger.debug("Parsing {0} with dateutil".format(date_string))
as_dt = parser.parse(date_string, default=self.base_date)
except Exception as e:
logger.debug(e)
Expand Down Expand Up @@ -127,12 +140,12 @@ def extract_date_strings(self, text, strict=False):

for dt2_str in dt2:
range_strings.extend(self.extract_date_strings(dt2_str, strict=strict))

found_range = True

for range_string in range_strings:
yield range_string

# Try to match regular datetimes if no ranges have been found
if not found_range:
for match in DATE_REGEX.finditer(text):
Expand All @@ -142,10 +155,10 @@ def extract_date_strings(self, text, strict=False):
## Get individual group matches
captures = match.capturesdict()
# time = captures.get('time')
digits = captures.get('digits')
digits = captures.get("digits")
# digits_modifiers = captures.get('digits_modifiers')
# days = captures.get('days')
months = captures.get('months')
months = captures.get("months")
# timezones = captures.get('timezones')
# delimiters = captures.get('delimiters')
# time_periods = captures.get('time_periods')
Expand All @@ -155,28 +168,24 @@ def extract_date_strings(self, text, strict=False):
complete = False
if len(digits) == 3: # 12-05-2015
complete = True
elif (len(months) == 1) and (len(digits) == 2): # 19 February 2013 year 09:10
elif (len(months) == 1) and (
len(digits) == 2
): # 19 February 2013 year 09:10
complete = True

if not complete:
continue

## sanitize date string
## replace unhelpful whitespace characters with single whitespace
match_str = re.sub(r'[\n\t\s\xa0]+', ' ', match_str)
match_str = re.sub(r"[\n\t\s\xa0]+", " ", match_str)
match_str = match_str.strip(STRIP_CHARS)

## Save sanitized source string
yield match_str, indices, captures
yield match_str, indices, captures


def find_dates(
text,
source=False,
index=False,
strict=False,
base_date=None
):
def find_dates(text, source=False, index=False, strict=False, base_date=None):
"""
Extract datetime strings from text
Expand Down
79 changes: 49 additions & 30 deletions datefinder/constants.py
@@ -1,37 +1,48 @@
import regex as re

NUMBERS_PATTERN = r'first|second|third|fourth|fifth|sixth|seventh|eighth|nineth|tenth'
POSITIONNAL_TOKENS= r'next|last'
DIGITS_PATTERN = r'\d+'
DIGITS_SUFFIXES= r'st|th|rd|nd'
DAYS_PATTERN = 'monday|tuesday|wednesday|thursday|friday|saturday|sunday|mon|tue|tues|wed|thur|thurs|fri|sat|sun'
MONTHS_PATTERN = r'january|february|march|april|may|june|july|august|september|october|november|december|jan\.?|feb\.?|mar\.?|apr\.?|may\.?|jun\.?|jul\.?|aug\.?|sep\.?|sept\.?|oct\.?|nov\.?|dec\.?'
TIMEZONES_PATTERN = 'ACDT|ACST|ACT|ACWDT|ACWST|ADDT|ADMT|ADT|AEDT|AEST|AFT|AHDT|AHST|AKDT|AKST|AKTST|AKTT|ALMST|ALMT|AMST|AMT|ANAST|ANAT|ANT|APT|AQTST|AQTT|ARST|ART|ASHST|ASHT|AST|AWDT|AWST|AWT|AZOMT|AZOST|AZOT|AZST|AZT|BAKST|BAKT|BDST|BDT|BEAT|BEAUT|BIOT|BMT|BNT|BORT|BOST|BOT|BRST|BRT|BST|BTT|BURT|CANT|CAPT|CAST|CAT|CAWT|CCT|CDDT|CDT|CEDT|CEMT|CEST|CET|CGST|CGT|CHADT|CHAST|CHDT|CHOST|CHOT|CIST|CKHST|CKT|CLST|CLT|CMT|COST|COT|CPT|CST|CUT|CVST|CVT|CWT|CXT|ChST|DACT|DAVT|DDUT|DFT|DMT|DUSST|DUST|EASST|EAST|EAT|ECT|EDDT|EDT|EEDT|EEST|EET|EGST|EGT|EHDT|EMT|EPT|EST|ET|EWT|FET|FFMT|FJST|FJT|FKST|FKT|FMT|FNST|FNT|FORT|FRUST|FRUT|GALT|GAMT|GBGT|GEST|GET|GFT|GHST|GILT|GIT|GMT|GST|GYT|HAA|HAC|HADT|HAE|HAP|HAR|HAST|HAT|HAY|HDT|HKST|HKT|HLV|HMT|HNA|HNC|HNE|HNP|HNR|HNT|HNY|HOVST|HOVT|HST|ICT|IDDT|IDT|IHST|IMT|IOT|IRDT|IRKST|IRKT|IRST|ISST|IST|JAVT|JCST|JDT|JMT|JST|JWST|KART|KDT|KGST|KGT|KIZST|KIZT|KMT|KOST|KRAST|KRAT|KST|KUYST|KUYT|KWAT|LHDT|LHST|LINT|LKT|LMT|LMT|LMT|LMT|LRT|LST|MADMT|MADST|MADT|MAGST|MAGT|MALST|MALT|MART|MAWT|MDDT|MDST|MDT|MEST|MET|MHT|MIST|MIT|MMT|MOST|MOT|MPT|MSD|MSK|MSM|MST|MUST|MUT|MVT|MWT|MYT|NCST|NCT|NDDT|NDT|NEGT|NEST|NET|NFT|NMT|NOVST|NOVT|NPT|NRT|NST|NT|NUT|NWT|NZDT|NZMT|NZST|OMSST|OMST|ORAST|ORAT|PDDT|PDT|PEST|PET|PETST|PETT|PGT|PHOT|PHST|PHT|PKST|PKT|PLMT|PMDT|PMMT|PMST|PMT|PNT|PONT|PPMT|PPT|PST|PT|PWT|PYST|PYT|QMT|QYZST|QYZT|RET|RMT|ROTT|SAKST|SAKT|SAMT|SAST|SBT|SCT|SDMT|SDT|SET|SGT|SHEST|SHET|SJMT|SLT|SMT|SRET|SRT|SST|STAT|SVEST|SVET|SWAT|SYOT|TAHT|TASST|TAST|TBIST|TBIT|TBMT|TFT|THA|TJT|TKT|TLT|TMT|TOST|TOT|TRST|TRT|TSAT|TVT|ULAST|ULAT|URAST|URAT|UTC|UYHST|UYST|UYT|UZST|UZT|VET|VLAST|VLAT|VOLST|VOLT|VOST|VUST|VUT|WARST|WART|WAST|WAT|WDT|WEDT|WEMT|WEST|WET|WFT|WGST|WGT|WIB|WIT|WITA|WMT|WSDT|WSST|WST|WT|XJT|YAKST|YAKT|YAPT|YDDT|YDT|YEKST|YEKST|YEKT|YEKT|YERST|YERT|YPT|YST|YWT|zzz'
## Spelled-out ordinals (first through tenth) as a regex alternation.
## "ninth" is the correct spelling; the historical misspelling "nineth" is
## kept so that any text this pattern matched before still matches.
NUMBERS_PATTERN = r"first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|nineth|tenth"
POSITIONNAL_TOKENS = r"next|last"
DIGITS_PATTERN = r"\d+"
DIGITS_SUFFIXES = r"st|th|rd|nd"
DAYS_PATTERN = "monday|tuesday|wednesday|thursday|friday|saturday|sunday|mon|tue|tues|wed|thur|thurs|fri|sat|sun"
MONTHS_PATTERN = r"january|february|march|april|may|june|july|august|september|october|november|december|jan\.?|feb\.?|mar\.?|apr\.?|may\.?|jun\.?|jul\.?|aug\.?|sep\.?|sept\.?|oct\.?|nov\.?|dec\.?"
TIMEZONES_PATTERN = "ACDT|ACST|ACT|ACWDT|ACWST|ADDT|ADMT|ADT|AEDT|AEST|AFT|AHDT|AHST|AKDT|AKST|AKTST|AKTT|ALMST|ALMT|AMST|AMT|ANAST|ANAT|ANT|APT|AQTST|AQTT|ARST|ART|ASHST|ASHT|AST|AWDT|AWST|AWT|AZOMT|AZOST|AZOT|AZST|AZT|BAKST|BAKT|BDST|BDT|BEAT|BEAUT|BIOT|BMT|BNT|BORT|BOST|BOT|BRST|BRT|BST|BTT|BURT|CANT|CAPT|CAST|CAT|CAWT|CCT|CDDT|CDT|CEDT|CEMT|CEST|CET|CGST|CGT|CHADT|CHAST|CHDT|CHOST|CHOT|CIST|CKHST|CKT|CLST|CLT|CMT|COST|COT|CPT|CST|CUT|CVST|CVT|CWT|CXT|ChST|DACT|DAVT|DDUT|DFT|DMT|DUSST|DUST|EASST|EAST|EAT|ECT|EDDT|EDT|EEDT|EEST|EET|EGST|EGT|EHDT|EMT|EPT|EST|ET|EWT|FET|FFMT|FJST|FJT|FKST|FKT|FMT|FNST|FNT|FORT|FRUST|FRUT|GALT|GAMT|GBGT|GEST|GET|GFT|GHST|GILT|GIT|GMT|GST|GYT|HAA|HAC|HADT|HAE|HAP|HAR|HAST|HAT|HAY|HDT|HKST|HKT|HLV|HMT|HNA|HNC|HNE|HNP|HNR|HNT|HNY|HOVST|HOVT|HST|ICT|IDDT|IDT|IHST|IMT|IOT|IRDT|IRKST|IRKT|IRST|ISST|IST|JAVT|JCST|JDT|JMT|JST|JWST|KART|KDT|KGST|KGT|KIZST|KIZT|KMT|KOST|KRAST|KRAT|KST|KUYST|KUYT|KWAT|LHDT|LHST|LINT|LKT|LMT|LMT|LMT|LMT|LRT|LST|MADMT|MADST|MADT|MAGST|MAGT|MALST|MALT|MART|MAWT|MDDT|MDST|MDT|MEST|MET|MHT|MIST|MIT|MMT|MOST|MOT|MPT|MSD|MSK|MSM|MST|MUST|MUT|MVT|MWT|MYT|NCST|NCT|NDDT|NDT|NEGT|NEST|NET|NFT|NMT|NOVST|NOVT|NPT|NRT|NST|NT|NUT|NWT|NZDT|NZMT|NZST|OMSST|OMST|ORAST|ORAT|PDDT|PDT|PEST|PET|PETST|PETT|PGT|PHOT|PHST|PHT|PKST|PKT|PLMT|PMDT|PMMT|PMST|PMT|PNT|PONT|PPMT|PPT|PST|PT|PWT|PYST|PYT|QMT|QYZST|QYZT|RET|RMT|ROTT|SAKST|SAKT|SAMT|SAST|SBT|SCT|SDMT|SDT|SET|SGT|SHEST|SHET|SJMT|SLT|SMT|SRET|SRT|SST|STAT|SVEST|SVET|SWAT|SYOT|TAHT|TASST|TAST|TBIST|TBIT|TBMT|TFT|THA|TJT|TKT|TLT|TMT|TOST|TOT|TRST|TRT|TSAT|TVT|ULAST|ULAT|URAST|URAT|UTC|UYHST|UYST|UYT|UZST|UZT|VET|VLAST|VLAT|VOLST|VOLT|VOST|VUST|VUT|WARST|WART|WAST|WAT|WDT|WEDT|WEMT|WEST|WET|WFT|WGST|WGT|WIB|WIT|WITA|WMT|WSDT|WSST|WST|WT|XJT|YAKST|YAKT|YAPT|YDDT|YDT|YEKST|YEKST|YEKT|YEKT|YERST|YERT|YPT|YST|YWT|zzz"
## explicit north american timezones that get replaced
NA_TIMEZONES_PATTERN = 'pacific|eastern|mountain|central'
ALL_TIMEZONES_PATTERN = TIMEZONES_PATTERN + '|' + NA_TIMEZONES_PATTERN
DELIMITERS_PATTERN = r'[/\:\-\,\s\_\+\@]+'
NA_TIMEZONES_PATTERN = "pacific|eastern|mountain|central"
ALL_TIMEZONES_PATTERN = TIMEZONES_PATTERN + "|" + NA_TIMEZONES_PATTERN
DELIMITERS_PATTERN = r"[/\:\-\,\s\_\+\@]+"

# Allows for straightforward datestamps, e.g. 2017, 201712, 20171223. Created with:
# YYYYMM_PATTERN = '|'.join(['19\d\d'+'{:0>2}'.format(mon)+'|20\d\d'+'{:0>2}'.format(mon) for mon in range(1, 13)])
# YYYYMMDD_PATTERN = '|'.join(['19\d\d'+'{:0>2}'.format(mon)+'[0123]\d|20\d\d'+'{:0>2}'.format(mon)+'[0123]\d' for mon in range(1, 13)])
YYYY_PATTERN = r'19\d\d|20\d\d'
YYYYMM_PATTERN = r'19\d\d01|20\d\d01|19\d\d02|20\d\d02|19\d\d03|20\d\d03|19\d\d04|20\d\d04|19\d\d05|20\d\d05|19\d\d06|20\d\d06|19\d\d07|20\d\d07|19\d\d08|20\d\d08|19\d\d09|20\d\d09|19\d\d10|20\d\d10|19\d\d11|20\d\d11|19\d\d12|20\d\d12'
YYYYMMDD_PATTERN = r'19\d\d01[0123]\d|20\d\d01[0123]\d|19\d\d02[0123]\d|20\d\d02[0123]\d|19\d\d03[0123]\d|20\d\d03[0123]\d|19\d\d04[0123]\d|20\d\d04[0123]\d|19\d\d05[0123]\d|20\d\d05[0123]\d|19\d\d06[0123]\d|20\d\d06[0123]\d|19\d\d07[0123]\d|20\d\d07[0123]\d|19\d\d08[0123]\d|20\d\d08[0123]\d|19\d\d09[0123]\d|20\d\d09[0123]\d|19\d\d10[0123]\d|20\d\d10[0123]\d|19\d\d11[0123]\d|20\d\d11[0123]\d|19\d\d12[0123]\d|20\d\d12[0123]\d'
YYYYMMDDHHMMSS_PATTERN = '|'.join([r'19\d\d' + '{:0>2}'.format(mon) + r'[0-3]\d[0-5]\d[0-5]\d[0-5]\d|20\d\d' + '{:0>2}'.format(mon) + r'[0-3]\d[0-5]\d[0-5]\d[0-5]\d' for mon in range(1, 13)])
ISO8601_PATTERN = r'(?P<years>-?(\:[1-9][0-9]*)?[0-9]{4})\-(?P<months>1[0-2]|0[1-9])\-(?P<days>3[01]|0[1-9]|[12][0-9])T(?P<hours>2[0-3]|[01][0-9])\:(?P<minutes>[0-5][0-9]):(?P<seconds>[0-5][0-9])(?:[\.,]+(?P<microseconds>[0-9]+))?(?P<offset>(?:Z|[+-](?:2[0-3]|[01][0-9])\:[0-5][0-9]))?'
UNDELIMITED_STAMPS_PATTERN = '|'.join([YYYYMMDDHHMMSS_PATTERN, YYYYMMDD_PATTERN, YYYYMM_PATTERN, ISO8601_PATTERN])
DELIMITERS_PATTERN = r'[/\:\-\,\.\s\_\+\@]+'
TIME_PERIOD_PATTERN = r'a\.m\.|am|p\.m\.|pm'
YYYY_PATTERN = r"19\d\d|20\d\d"
YYYYMM_PATTERN = r"19\d\d01|20\d\d01|19\d\d02|20\d\d02|19\d\d03|20\d\d03|19\d\d04|20\d\d04|19\d\d05|20\d\d05|19\d\d06|20\d\d06|19\d\d07|20\d\d07|19\d\d08|20\d\d08|19\d\d09|20\d\d09|19\d\d10|20\d\d10|19\d\d11|20\d\d11|19\d\d12|20\d\d12"
YYYYMMDD_PATTERN = r"19\d\d01[0123]\d|20\d\d01[0123]\d|19\d\d02[0123]\d|20\d\d02[0123]\d|19\d\d03[0123]\d|20\d\d03[0123]\d|19\d\d04[0123]\d|20\d\d04[0123]\d|19\d\d05[0123]\d|20\d\d05[0123]\d|19\d\d06[0123]\d|20\d\d06[0123]\d|19\d\d07[0123]\d|20\d\d07[0123]\d|19\d\d08[0123]\d|20\d\d08[0123]\d|19\d\d09[0123]\d|20\d\d09[0123]\d|19\d\d10[0123]\d|20\d\d10[0123]\d|19\d\d11[0123]\d|20\d\d11[0123]\d|19\d\d12[0123]\d|20\d\d12[0123]\d"
## Undelimited 14-digit timestamps (YYYYMMDDHHMMSS) for years 19xx and 20xx.
## One alternative is emitted per (month, century) pair, months 01..12 in
## order, with the 19xx form preceding the 20xx form for each month.
YYYYMMDDHHMMSS_PATTERN = "|".join(
    r"{century}\d\d{month:0>2}[0-3]\d[0-5]\d[0-5]\d[0-5]\d".format(
        century=century, month=month
    )
    for month in range(1, 13)
    for century in ("19", "20")
)
ISO8601_PATTERN = r"(?P<years>-?(\:[1-9][0-9]*)?[0-9]{4})\-(?P<months>1[0-2]|0[1-9])\-(?P<days>3[01]|0[1-9]|[12][0-9])T(?P<hours>2[0-3]|[01][0-9])\:(?P<minutes>[0-5][0-9]):(?P<seconds>[0-5][0-9])(?:[\.,]+(?P<microseconds>[0-9]+))?(?P<offset>(?:Z|[+-](?:2[0-3]|[01][0-9])\:[0-5][0-9]))?"
UNDELIMITED_STAMPS_PATTERN = "|".join(
[YYYYMMDDHHMMSS_PATTERN, YYYYMMDD_PATTERN, YYYYMM_PATTERN, ISO8601_PATTERN]
)
## NOTE: this rebinds the DELIMITERS_PATTERN assigned earlier in this module;
## this version additionally accepts "." as a delimiter character.
DELIMITERS_PATTERN = r"[/\:\-\,\.\s\_\+\@]+"
TIME_PERIOD_PATTERN = r"a\.m\.|am|p\.m\.|pm"
## Tokens that can appear in date strings but are not recognized by dateutil
EXTRA_TOKENS_PATTERN = r'due|by|on|during|standard|daylight|savings|time|date|dated|of|to|through|between|until|at|day'
EXTRA_TOKENS_PATTERN = r"due|by|on|during|standard|daylight|savings|time|date|dated|of|to|through|between|until|at|day"

## TODO: Get english numbers?
## http://www.rexegg.com/regex-trick-numbers-in-english.html

RELATIVE_PATTERN = 'before|after|next|last|ago'
TIME_SHORTHAND_PATTERN = 'noon|midnight|today|yesterday'
UNIT_PATTERN = 'second|minute|hour|day|week|month|year'
RELATIVE_PATTERN = "before|after|next|last|ago"
TIME_SHORTHAND_PATTERN = "noon|midnight|today|yesterday"
UNIT_PATTERN = "second|minute|hour|day|week|month|year"

## Time pattern is used independently, so specified here.
TIME_PATTERN = r"""
Expand Down Expand Up @@ -60,8 +71,7 @@
)
)
""".format(
time_periods=TIME_PERIOD_PATTERN,
timezones=ALL_TIMEZONES_PATTERN
time_periods=TIME_PERIOD_PATTERN, timezones=ALL_TIMEZONES_PATTERN
)

DATES_PATTERN = """
Expand Down Expand Up @@ -111,7 +121,7 @@
months=MONTHS_PATTERN,
delimiters=DELIMITERS_PATTERN,
positionnal_tokens=POSITIONNAL_TOKENS,
extra_tokens=EXTRA_TOKENS_PATTERN
extra_tokens=EXTRA_TOKENS_PATTERN,
)

RANGE_PATTERN = r"""
Expand All @@ -120,13 +130,21 @@
[\s]?(to|through)[\s]?
(?P<dt2>{date_pattern})
)
""".format(date_pattern=DATES_PATTERN)
""".format(
date_pattern=DATES_PATTERN
)

DATE_REGEX = re.compile(DATES_PATTERN, re.IGNORECASE | re.MULTILINE | re.UNICODE | re.DOTALL | re.VERBOSE)
DATE_REGEX = re.compile(
DATES_PATTERN, re.IGNORECASE | re.MULTILINE | re.UNICODE | re.DOTALL | re.VERBOSE
)

TIME_REGEX = re.compile(TIME_PATTERN, re.IGNORECASE | re.MULTILINE | re.UNICODE | re.DOTALL | re.VERBOSE)
TIME_REGEX = re.compile(
TIME_PATTERN, re.IGNORECASE | re.MULTILINE | re.UNICODE | re.DOTALL | re.VERBOSE
)

RANGE_REGEX = re.compile(RANGE_PATTERN, re.IGNORECASE | re.MULTILINE | re.UNICODE | re.DOTALL | re.VERBOSE)
RANGE_REGEX = re.compile(
RANGE_PATTERN, re.IGNORECASE | re.MULTILINE | re.UNICODE | re.DOTALL | re.VERBOSE
)

## These tokens can be in original text but dateutil
## won't handle them without modification
Expand All @@ -151,4 +169,5 @@
}

## Characters that can be removed from ends of matched strings
STRIP_CHARS = ' \n\t:-.,_'
STRIP_CHARS = " \n\t:-.,_"

0 comments on commit 5e24e8d

Please sign in to comment.