Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add french language support #177

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.rst
Expand Up @@ -48,14 +48,15 @@ How to Use

In [2]: import datefinder

In [3]: matches = datefinder.find_dates(string_with_dates)
In [3]: matches = datefinder.find_dates(string_with_dates, locale="en_US")

In [4]: for match in matches:
...: print match
...:
2017-01-04 20:00:00
2005-01-15 00:00:00

**Note:** The ``locale`` parameter is optional. If you do not specify a locale, the default is ``en_US``.

Demo
----
Expand Down
16 changes: 13 additions & 3 deletions datefinder/__init__.py
@@ -1,4 +1,6 @@
import copy
import datetime
from locale import setlocale, LC_ALL
import logging
import regex as re
from dateutil import tz, parser
Expand All @@ -21,7 +23,7 @@ class DateFinder(object):
Locates dates in a text
"""

def __init__(self, base_date=None, first="month"):
def __init__(self, base_date=None, first="month", locale="en_US"):
self.base_date = base_date
self.dayfirst = False
self.yearfirst = False
Expand All @@ -30,6 +32,12 @@ def __init__(self, base_date=None, first="month"):
if first == "year":
self.yearfirst = True


setlocale(LC_ALL, locale)
from datefinder.local_parser_info import LocaleParserInfo
self.parserinfo = LocaleParserInfo()


def find_dates(self, text, source=False, index=False, strict=False):

for date_string, indices, captures in self.extract_date_strings(
Expand Down Expand Up @@ -117,6 +125,7 @@ def parse_date_string(self, date_string, captures):
default=self.base_date,
dayfirst=self.dayfirst,
yearfirst=self.yearfirst,
parserinfo=self.parserinfo
)
except (ValueError, OverflowError):
# replace tokens that are problematic for dateutil
Expand All @@ -136,6 +145,7 @@ def parse_date_string(self, date_string, captures):
default=self.base_date,
dayfirst=self.dayfirst,
yearfirst=self.yearfirst,
parserinfo=self.parserinfo
)
except Exception as e:
logger.debug(e)
Expand Down Expand Up @@ -318,7 +328,7 @@ def split_date_range(text):


def find_dates(
text, source=False, index=False, strict=False, base_date=None, first="month"
text, source=False, index=False, strict=False, base_date=None, first="month", locale=None
):
"""
Extract datetime strings from text
Expand Down Expand Up @@ -351,5 +361,5 @@ def find_dates(
:return: Returns a generator that produces :mod:`datetime.datetime` objects,
or a tuple with the source text and index, if requested
"""
date_finder = DateFinder(base_date=base_date, first=first)
date_finder = DateFinder(base_date=base_date, first=first, locale=locale)
return date_finder.find_dates(text, source=source, index=index, strict=strict)
4 changes: 2 additions & 2 deletions datefinder/constants.py
Expand Up @@ -4,8 +4,8 @@
POSITIONNAL_TOKENS = r"next|last"
DIGITS_PATTERN = r"\d+"
DIGITS_SUFFIXES = r"st|th|rd|nd"
DAYS_PATTERN = "monday|tuesday|wednesday|thursday|friday|saturday|sunday|mandag|tirsdag|onsdag|torsdag|fredag|lørdag|søndag|mon|tue|tues|wed|thu|thur|thurs|fri|sat|sun|man|tir|tirs|ons|tor|tors|fre|lør|søn"
MONTHS_PATTERN = r"january|february|march|april|may|june|july|august|september|october|november|december|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januar|februar|marts|april|maj|juni|juli|august|september|oktober|november|december|jan[\.\s]|ene[\.\s]|feb[\.\s]|mar[\.\s]|apr[\.\s]|abr[\.\s]|may[\.\s]|maj[\.\s]|jun[\.\s]|jul[\.\s]|aug[\.\s]|ago[\.\s]|sep[^A-Za-z]|sept[\.\s]|oct[\.\s]|okt[\.\s]|nov[\.\s]|dec[\.\s]|dic[\.\s]"
DAYS_PATTERN = "lundi|mardi|mercredi|jeudi|vendredi|samedi|dimanche|monday|tuesday|wednesday|thursday|friday|saturday|sunday|mandag|tirsdag|onsdag|torsdag|fredag|lørdag|søndag|mon|tue|tues|wed|thu|thur|thurs|fri|sat|sun|man|tir|tirs|ons|tor|tors|fre|lør|søn"
MONTHS_PATTERN = r"janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|january|february|march|april|may|june|july|august|september|october|november|december|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januar|februar|marts|april|maj|juni|juli|august|september|oktober|november|december|jan[\.\s]|ene[\.\s]|feb[\.\s]|mar[\.\s]|apr[\.\s]|abr[\.\s]|may[\.\s]|maj[\.\s]|jun[\.\s]|jul[\.\s]|aug[\.\s]|ago[\.\s]|sep[^A-Za-z]|sept[\.\s]|oct[\.\s]|okt[\.\s]|nov[\.\s]|dec[\.\s]|dic[\.\s]"
TIMEZONES_PATTERN = "ACDT|ACST|ACT|ACWDT|ACWST|ADDT|ADMT|ADT|AEDT|AEST|AFT|AHDT|AHST|AKDT|AKST|AKTST|AKTT|ALMST|ALMT|AMST|AMT|ANAST|ANAT|ANT|APT|AQTST|AQTT|ARST|ART|ASHST|ASHT|AST|AWDT|AWST|AWT|AZOMT|AZOST|AZOT|AZST|AZT|BAKST|BAKT|BDST|BDT|BEAT|BEAUT|BIOT|BMT|BNT|BORT|BOST|BOT|BRST|BRT|BST|BTT|BURT|CANT|CAPT|CAST|CAT|CAWT|CCT|CDDT|CDT|CEDT|CEMT|CEST|CET|CGST|CGT|CHADT|CHAST|CHDT|CHOST|CHOT|CIST|CKHST|CKT|CLST|CLT|CMT|COST|COT|CPT|CST|CUT|CVST|CVT|CWT|CXT|ChST|DACT|DAVT|DDUT|DFT|DMT|DUSST|DUST|EASST|EAST|EAT|ECT|EDDT|EDT|EEDT|EEST|EET|EGST|EGT|EHDT|EMT|EPT|EST|ET|EWT|FET|FFMT|FJST|FJT|FKST|FKT|FMT|FNST|FNT|FORT|FRUST|FRUT|GALT|GAMT|GBGT|GEST|GET|GFT|GHST|GILT|GIT|GMT|GST|GYT|HAA|HAC|HADT|HAE|HAP|HAR|HAST|HAT|HAY|HDT|HKST|HKT|HLV|HMT|HNA|HNC|HNE|HNP|HNR|HNT|HNY|HOVST|HOVT|HST|ICT|IDDT|IDT|IHST|IMT|IOT|IRDT|IRKST|IRKT|IRST|ISST|IST|JAVT|JCST|JDT|JMT|JST|JWST|KART|KDT|KGST|KGT|KIZST|KIZT|KMT|KOST|KRAST|KRAT|KST|KUYST|KUYT|KWAT|LHDT|LHST|LINT|LKT|LMT|LMT|LMT|LMT|LRT|LST|MADMT|MADST|MADT|MAGST|MAGT|MALST|MALT|MART|MAWT|MDDT|MDST|MDT|MEST|MET|MHT|MIST|MIT|MMT|MOST|MOT|MPT|MSD|MSK|MSM|MST|MUST|MUT|MVT|MWT|MYT|NCST|NCT|NDDT|NDT|NEGT|NEST|NET|NFT|NMT|NOVST|NOVT|NPT|NRT|NST|NT|NUT|NWT|NZDT|NZMT|NZST|OMSST|OMST|ORAST|ORAT|PDDT|PDT|PEST|PET|PETST|PETT|PGT|PHOT|PHST|PHT|PKST|PKT|PLMT|PMDT|PMMT|PMST|PMT|PNT|PONT|PPMT|PPT|PST|PT|PWT|PYST|PYT|QMT|QYZST|QYZT|RET|RMT|ROTT|SAKST|SAKT|SAMT|SAST|SBT|SCT|SDMT|SDT|SET|SGT|SHEST|SHET|SJMT|SLT|SMT|SRET|SRT|SST|STAT|SVEST|SVET|SWAT|SYOT|TAHT|TASST|TAST|TBIST|TBIT|TBMT|TFT|THA|TJT|TKT|TLT|TMT|TOST|TOT|TRST|TRT|TSAT|TVT|ULAST|ULAT|URAST|URAT|UTC|UYHST|UYST|UYT|UZST|UZT|VET|VLAST|VLAT|VOLST|VOLT|VOST|VUST|VUT|WARST|WART|WAST|WAT|WDT|WEDT|WEMT|WEST|WET|WFT|WGST|WGT|WIB|WIT|WITA|WMT|WSDT|WSST|WST|WT|XJT|YAKST|YAKT|YAPT|YDDT|YDT|YEKST|YEKST|YEKT|YEKT|YERST|YERT|YPT|YST|YWT|zzz"
## explicit north american timezones that get replaced
NA_TIMEZONES_PATTERN = "pacific|eastern|mountain|central"
Expand Down
6 changes: 6 additions & 0 deletions datefinder/local_parser_info.py
@@ -0,0 +1,6 @@
import calendar
from dateutil import parser

class LocaleParserInfo(parser.parserinfo):
    """dateutil ``parserinfo`` whose weekday and month names follow the locale.

    The name tables are read from the :mod:`calendar` module, whose
    ``day_name`` / ``day_abbr`` / ``month_name`` / ``month_abbr`` sequences
    are rendered in the locale that is active at the moment they are read.

    The tables are rebuilt every time an instance is created.  Building them
    only in the class body would freeze them to the locale that happened to
    be active when this module was first imported, so a later instance
    created after a different ``setlocale`` call would still use the first
    locale's names.
    """

    # Snapshot taken at import time; kept so class-level attribute access
    # keeps working.  Instances override these in ``__init__``.
    WEEKDAYS = list(zip(calendar.day_abbr, calendar.day_name))
    MONTHS = list(zip(calendar.month_abbr, calendar.month_name))[1:]

    def __init__(self, dayfirst=False, yearfirst=False):
        """Build the name tables for the currently active locale.

        :param dayfirst: forwarded unchanged to ``parser.parserinfo``.
        :param yearfirst: forwarded unchanged to ``parser.parserinfo``.
        """
        # The instance attributes must be set before the base initializer
        # runs, because it reads ``self.WEEKDAYS`` / ``self.MONTHS`` to
        # build its lookup tables.
        self.WEEKDAYS = list(zip(calendar.day_abbr, calendar.day_name))
        # ``calendar.month_abbr`` / ``month_name`` carry an empty
        # placeholder at index 0; drop it so index 0 is January.
        self.MONTHS = list(zip(calendar.month_abbr, calendar.month_name))[1:]
        super(LocaleParserInfo, self).__init__(
            dayfirst=dayfirst, yearfirst=yearfirst
        )