Skip to content

extracts structured data from text using user-defined delimiters (strings or regex)

License

Notifications You must be signed in to change notification settings

hansalemaos/parifinder

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

2 Commits
 
 
 
 
 
 
 
 
 
 

Repository files navigation

Extracts structured data from text using user-defined delimiters (strings or regex).

Tested against Windows / Python 3.11 / Anaconda

pip install parifinder

parifinder extracts structured data from text using user-defined delimiters (strings or regex), making it versatile for data processing.

Advantages

Flexibility:

The `parse_pairs` function handles a wide range of scenarios, which makes it versatile for parsing text with various delimiters. It accepts a single pair or multiple pairs of delimiters, and each delimiter can be either a simple string or a complex regular expression. This flexibility makes it suitable for many different use cases.

Scalability:

It can parse multiple pairs of delimiters within a given text, which is especially useful when dealing with documents or data containing nested elements.

Pure Python:

parifinder uses only Python's standard library.

from parifinder import parse_pairs
from pprint import pprint

def _show(header, result):
    """Print ``header`` on its own line, then pretty-print ``result``."""
    print(header)
    # width=1 makes pprint break the structure wherever it is allowed to,
    # which is why the sample output is so tall.
    pprint(result, indent=1, width=1)


# Example 0: one pair of plain single-character string delimiters.
s1_0, s2_0 = "[", "]"
text_0 = """[[1, 2, 2], [5], [2, 3]], 12: [[4, 4, 4], [12, 0], [6, 6]], 3: [[1, 2]][[1, 2, 2], [5], [2, 3]], 12: [[4, 4, 4], [12, 0], [6, 6]], 3: [[1, 2]]"""
r0 = parse_pairs(string=text_0, s1=s1_0, s2=s2_0, str_regex=False)
_show("r0-----------------------------------------------------------------", r0)

# Example 1: one pair of multi-character string delimiters (HTML tags).
s1_1, s2_1 = "<p>", "</p>"
text_1 = "<body><p>a</p><p>a</p><p>The HTML <code>button</code> tag defines a clickable button.</p><p>x</p><p>The CSS <code>background-color</code> property defines the background color of an element.</p></body></html>"
r1 = parse_pairs(string=text_1, s1=s1_1, s2=s2_1, str_regex=False)
_show("r1-----------------------------------------------------------------", r1)

# Example 2: delimiters given as regular-expression patterns (str_regex=True).
s1_2, s2_2 = r"\[\d", r"/\d]"
text_2 = "[1bla[2bla/2]/1]"
r2 = parse_pairs(string=text_2, s1=s1_2, s2=s2_2, str_regex=True)
_show("r2-----------------------------------------------------------------", r2)

# Example 3: several pairs passed through s1 as (opening, closing) tuples;
# s2 is None in this form.
s1_3, s2_3 = [("[1", "/1]"), ("[2", "/2]")], None
text_3 = "[1bla[2bla/2]/1]"
r3 = parse_pairs(string=text_3, s1=s1_3, s2=s2_3, str_regex=False)
_show("r3-----------------------------------------------------------------", r3)

# Example 4: several pairs passed as two lists, openings in s1 and closings
# in s2 (judging by the values, they are matched up by position).
s1_4, s2_4 = ["[1", "[2"], ["/1]", "/2]"]
text_4 = "[1bla[2bla/2]/1]"
r4 = parse_pairs(string=text_4, s1=s1_4, s2=s2_4, str_regex=False)
_show("r4-----------------------------------------------------------------", r4)


# r0-----------------------------------------------------------------
# {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23): {'children': [(1,
#                                                                                                         2,
#                                                                                                         3,
#                                                                                                         4,
#                                                                                                         5,
#                                                                                                         6,
#                                                                                                         7,
#                                                                                                         8,
#                                                                                                         9),
#                                                                                                        (17,
#                                                                                                         18,
#                                                                                                         19,
#                                                                                                         20,
#                                                                                                         21,
#                                                                                                         22),
#                                                                                                        (12,
#                                                                                                         13,
#                                                                                                         14)],
#                                                                                           'end': 23,
#                                                                                           'parents': [],
#                                                                                           'size': 23,
#                                                                                           'start': 0,
#                                                                                           'text': '[[1, '
#                                                                                                   '2, '
#                                                                                                   '2], '
#                                                                                                   '[5], '
#                                                                                                   '[2, '
#                                                                                                   '3]]'},
#  (1, 2, 3, 4, 5, 6, 7, 8, 9): {'children': [],
#                                'end': 9,
#                                'parents': [(0,
#                                             1,
#                                             2,
#                                             3,
#                                             4,
#                                             5,
#                                             6,
#                                             7,
#                                             8,
#                                             9,
#                                             10,
#                                             11,
#                                             12,
#                                             13,
#                                             14,
#                                             15,
#                                             16,
#                                             17,
#                                             18,
#                                             19,
#                                             20,
#                                             21,
#                                             22,
#                                             23)],
#                                'size': 8,
#                                'start': 1,
#                                'text': '[1, '
#                                        '2, '
#                                        '2]'},
#  (12, 13, 14): {'children': [],
#                 'end': 14,
#                 'parents': [(0,
#                              1,
#                              2,
#                              3,
#                              4,
#                              5,
#                              6,
#                              7,
#                              8,
#                              9,
#                              10,
#                              11,
#                              12,
#                              13,
#                              14,
#                              15,
#                              16,
#                              17,
#                              18,
#                              19,
#                              20,
#                              21,
#                              22,
#                              23)],
#                 'size': 2,
#                 'start': 12,
#                 'text': '[5]'},
#  (17, 18, 19, 20, 21, 22): {'children': [],
#                             'end': 22,
#                             'parents': [(0,
#                                          1,
#                                          2,
#                                          3,
#                                          4,
#                                          5,
#                                          6,
#                                          7,
#                                          8,
#                                          9,
#                                          10,
#                                          11,
#                                          12,
#                                          13,
#                                          14,
#                                          15,
#                                          16,
#                                          17,
#                                          18,
#                                          19,
#                                          20,
#                                          21,
#                                          22,
#                                          23)],
#                             'size': 5,
#                             'start': 17,
#                             'text': '[2, '
#                                     '3]'},
#  (30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57): {'children': [(31,
#                                                                                                                                   32,
#                                                                                                                                   33,
#                                                                                                                                   34,
#                                                                                                                                   35,
#                                                                                                                                   36,
#                                                                                                                                   37,
#                                                                                                                                   38,
#                                                                                                                                   39),
#                                                                                                                                  (42,
#                                                                                                                                   43,
#                                                                                                                                   44,
#                                                                                                                                   45,
#                                                                                                                                   46,
#                                                                                                                                   47,
#                                                                                                                                   48),
#                                                                                                                                  (51,
#                                                                                                                                   52,
#                                                                                                                                   53,
#                                                                                                                                   54,
#                                                                                                                                   55,
#                                                                                                                                   56)],
#                                                                                                                     'end': 57,
#                                                                                                                     'parents': [],
#                                                                                                                     'size': 27,
#                                                                                                                     'start': 30,
#                                                                                                                     'text': '[[4, '
#                                                                                                                             '4, '
#                                                                                                                             '4], '
#                                                                                                                             '[12, '
#                                                                                                                             '0], '
#                                                                                                                             '[6, '
#                                                                                                                             '6]]'},
#  (31, 32, 33, 34, 35, 36, 37, 38, 39): {'children': [],
#                                         'end': 39,
#                                         'parents': [(30,
#                                                      31,
#                                                      32,
#                                                      33,
#                                                      34,
#                                                      35,
#                                                      36,
#                                                      37,
#                                                      38,
#                                                      39,
#                                                      40,
#                                                      41,
#                                                      42,
#                                                      43,
#                                                      44,
#                                                      45,
#                                                      46,
#                                                      47,
#                                                      48,
#                                                      49,
#                                                      50,
#                                                      51,
#                                                      52,
#                                                      53,
#                                                      54,
#                                                      55,
#                                                      56,
#                                                      57)],
#                                         'size': 8,
#                                         'start': 31,
#                                         'text': '[4, '
#                                                 '4, '
#                                                 '4]'},
#  (42, 43, 44, 45, 46, 47, 48): {'children': [],
#                                 'end': 48,
#                                 'parents': [(30,
#                                              31,
#                                              32,
#                                              33,
#                                              34,
#                                              35,
#                                              36,
#                                              37,
#                                              38,
#                                              39,
#                                              40,
#                                              41,
#                                              42,
#                                              43,
#                                              44,
#                                              45,
#                                              46,
#                                              47,
#                                              48,
#                                              49,
#                                              50,
#                                              51,
#                                              52,
#                                              53,
#                                              54,
#                                              55,
#                                              56,
#                                              57)],
#                                 'size': 6,
#                                 'start': 42,
#                                 'text': '[12, '
#                                         '0]'},
#  (51, 52, 53, 54, 55, 56): {'children': [],
#                             'end': 56,
#                             'parents': [(30,
#                                          31,
#                                          32,
#                                          33,
#                                          34,
#                                          35,
#                                          36,
#                                          37,
#                                          38,
#                                          39,
#                                          40,
#                                          41,
#                                          42,
#                                          43,
#                                          44,
#                                          45,
#                                          46,
#                                          47,
#                                          48,
#                                          49,
#                                          50,
#                                          51,
#                                          52,
#                                          53,
#                                          54,
#                                          55,
#                                          56,
#                                          57)],
#                             'size': 5,
#                             'start': 51,
#                             'text': '[6, '
#                                     '6]'},
#  (63, 64, 65, 66, 67, 68, 69, 70): {'children': [(64,
#                                                   65,
#                                                   66,
#                                                   67,
#                                                   68,
#                                                   69)],
#                                     'end': 70,
#                                     'parents': [],
#                                     'size': 7,
#                                     'start': 63,
#                                     'text': '[[1, '
#                                             '2]]'},
#  (64, 65, 66, 67, 68, 69): {'children': [],
#                             'end': 69,
#                             'parents': [(63,
#                                          64,
#                                          65,
#                                          66,
#                                          67,
#                                          68,
#                                          69,
#                                          70)],
#                             'size': 5,
#                             'start': 64,
#                             'text': '[1, '
#                                     '2]'},
#  (71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94): {'children': [(72,
#                                                                                                                   73,
#                                                                                                                   74,
#                                                                                                                   75,
#                                                                                                                   76,
#                                                                                                                   77,
#                                                                                                                   78,
#                                                                                                                   79,
#                                                                                                                   80),
#                                                                                                                  (88,
#                                                                                                                   89,
#                                                                                                                   90,
#                                                                                                                   91,
#                                                                                                                   92,
#                                                                                                                   93),
#                                                                                                                  (83,
#                                                                                                                   84,
#                                                                                                                   85)],
#                                                                                                     'end': 94,
#                                                                                                     'parents': [],
#                                                                                                     'size': 23,
#                                                                                                     'start': 71,
#                                                                                                     'text': '[[1, '
#                                                                                                             '2, '
#                                                                                                             '2], '
#                                                                                                             '[5], '
#                                                                                                             '[2, '
#                                                                                                             '3]]'},
#  (72, 73, 74, 75, 76, 77, 78, 79, 80): {'children': [],
#                                         'end': 80,
#                                         'parents': [(71,
#                                                      72,
#                                                      73,
#                                                      74,
#                                                      75,
#                                                      76,
#                                                      77,
#                                                      78,
#                                                      79,
#                                                      80,
#                                                      81,
#                                                      82,
#                                                      83,
#                                                      84,
#                                                      85,
#                                                      86,
#                                                      87,
#                                                      88,
#                                                      89,
#                                                      90,
#                                                      91,
#                                                      92,
#                                                      93,
#                                                      94)],
#                                         'size': 8,
#                                         'start': 72,
#                                         'text': '[1, '
#                                                 '2, '
#                                                 '2]'},
#  (83, 84, 85): {'children': [],
#                 'end': 85,
#                 'parents': [(71,
#                              72,
#                              73,
#                              74,
#                              75,
#                              76,
#                              77,
#                              78,
#                              79,
#                              80,
#                              81,
#                              82,
#                              83,
#                              84,
#                              85,
#                              86,
#                              87,
#                              88,
#                              89,
#                              90,
#                              91,
#                              92,
#                              93,
#                              94)],
#                 'size': 2,
#                 'start': 83,
#                 'text': '[5]'},
#  (88, 89, 90, 91, 92, 93): {'children': [],
#                             'end': 93,
#                             'parents': [(71,
#                                          72,
#                                          73,
#                                          74,
#                                          75,
#                                          76,
#                                          77,
#                                          78,
#                                          79,
#                                          80,
#                                          81,
#                                          82,
#                                          83,
#                                          84,
#                                          85,
#                                          86,
#                                          87,
#                                          88,
#                                          89,
#                                          90,
#                                          91,
#                                          92,
#                                          93,
#                                          94)],
#                             'size': 5,
#                             'start': 88,
#                             'text': '[2, '
#                                     '3]'},
#  (101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128): {'children': [(102,
#                                                                                                                                                               103,
#                                                                                                                                                               104,
#                                                                                                                                                               105,
#                                                                                                                                                               106,
#                                                                                                                                                               107,
#                                                                                                                                                               108,
#                                                                                                                                                               109,
#                                                                                                                                                               110),
#                                                                                                                                                              (113,
#                                                                                                                                                               114,
#                                                                                                                                                               115,
#                                                                                                                                                               116,
#                                                                                                                                                               117,
#                                                                                                                                                               118,
#                                                                                                                                                               119),
#                                                                                                                                                              (122,
#                                                                                                                                                               123,
#                                                                                                                                                               124,
#                                                                                                                                                               125,
#                                                                                                                                                               126,
#                                                                                                                                                               127)],
#                                                                                                                                                 'end': 128,
#                                                                                                                                                 'parents': [],
#                                                                                                                                                 'size': 27,
#                                                                                                                                                 'start': 101,
#                                                                                                                                                 'text': '[[4, '
#                                                                                                                                                         '4, '
#                                                                                                                                                         '4], '
#                                                                                                                                                         '[12, '
#                                                                                                                                                         '0], '
#                                                                                                                                                         '[6, '
#                                                                                                                                                         '6]]'},
#  (102, 103, 104, 105, 106, 107, 108, 109, 110): {'children': [],
#                                                  'end': 110,
#                                                  'parents': [(101,
#                                                               102,
#                                                               103,
#                                                               104,
#                                                               105,
#                                                               106,
#                                                               107,
#                                                               108,
#                                                               109,
#                                                               110,
#                                                               111,
#                                                               112,
#                                                               113,
#                                                               114,
#                                                               115,
#                                                               116,
#                                                               117,
#                                                               118,
#                                                               119,
#                                                               120,
#                                                               121,
#                                                               122,
#                                                               123,
#                                                               124,
#                                                               125,
#                                                               126,
#                                                               127,
#                                                               128)],
#                                                  'size': 8,
#                                                  'start': 102,
#                                                  'text': '[4, '
#                                                          '4, '
#                                                          '4]'},
#  (113, 114, 115, 116, 117, 118, 119): {'children': [],
#                                        'end': 119,
#                                        'parents': [(101,
#                                                     102,
#                                                     103,
#                                                     104,
#                                                     105,
#                                                     106,
#                                                     107,
#                                                     108,
#                                                     109,
#                                                     110,
#                                                     111,
#                                                     112,
#                                                     113,
#                                                     114,
#                                                     115,
#                                                     116,
#                                                     117,
#                                                     118,
#                                                     119,
#                                                     120,
#                                                     121,
#                                                     122,
#                                                     123,
#                                                     124,
#                                                     125,
#                                                     126,
#                                                     127,
#                                                     128)],
#                                        'size': 6,
#                                        'start': 113,
#                                        'text': '[12, '
#                                                '0]'},
#  (122, 123, 124, 125, 126, 127): {'children': [],
#                                   'end': 127,
#                                   'parents': [(101,
#                                                102,
#                                                103,
#                                                104,
#                                                105,
#                                                106,
#                                                107,
#                                                108,
#                                                109,
#                                                110,
#                                                111,
#                                                112,
#                                                113,
#                                                114,
#                                                115,
#                                                116,
#                                                117,
#                                                118,
#                                                119,
#                                                120,
#                                                121,
#                                                122,
#                                                123,
#                                                124,
#                                                125,
#                                                126,
#                                                127,
#                                                128)],
#                                   'size': 5,
#                                   'start': 122,
#                                   'text': '[6, '
#                                           '6]'},
#  (134, 135, 136, 137, 138, 139, 140, 141): {'children': [(135,
#                                                           136,
#                                                           137,
#                                                           138,
#                                                           139,
#                                                           140)],
#                                             'end': 141,
#                                             'parents': [],
#                                             'size': 7,
#                                             'start': 134,
#                                             'text': '[[1, '
#                                                     '2]]'},
#  (135, 136, 137, 138, 139, 140): {'children': [],
#                                   'end': 140,
#                                   'parents': [(134,
#                                                135,
#                                                136,
#                                                137,
#                                                138,
#                                                139,
#                                                140,
#                                                141)],
#                                   'size': 5,
#                                   'start': 135,
#                                   'text': '[1, '
#                                           '2]'}}
# r1-----------------------------------------------------------------
# {(6, 7, 8, 9, 10, 11, 12, 13, 14): {'children': [],
#                                     'end': 14,
#                                     'parents': [],
#                                     'size': 9,
#                                     'start': 6,
#                                     'text': '<p>a</p>'},
#  (14, 15, 16, 17, 18, 19, 20, 21, 22): {'children': [],
#                                         'end': 22,
#                                         'parents': [],
#                                         'size': 9,
#                                         'start': 14,
#                                         'text': '<p>a</p>'},
#  (22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89): {'children': [],
#                                                                                                                                                                                                                                                                                     'end': 89,
#                                                                                                                                                                                                                                                                                     'parents': [],
#                                                                                                                                                                                                                                                                                     'size': 68,
#                                                                                                                                                                                                                                                                                     'start': 22,
#                                                                                                                                                                                                                                                                                     'text': '<p>The '
#                                                                                                                                                                                                                                                                                             'HTML '
#                                                                                                                                                                                                                                                                                             '<code>button</code> '
#                                                                                                                                                                                                                                                                                             'tag '
#                                                                                                                                                                                                                                                                                             'defines '
#                                                                                                                                                                                                                                                                                             'a '
#                                                                                                                                                                                                                                                                                             'clickable '
#                                                                                                                                                                                                                                                                                             'button.</p>'},
#  (89, 90, 91, 92, 93, 94, 95, 96, 97): {'children': [],
#                                         'end': 97,
#                                         'parents': [],
#                                         'size': 9,
#                                         'start': 89,
#                                         'text': '<p>x</p>'},
#  (97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194): {'children': [],
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            'end': 194,
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            'parents': [],
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            'size': 98,
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            'start': 97,
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            'text': '<p>The '
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    'CSS '
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    '<code>background-color</code> '
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    'property '
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    'defines '
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    'the '
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    'background '
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    'color '
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    'of '
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    'an '
#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    'element.</p>'}}
# r2-----------------------------------------------------------------
# {('[1', '/1]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
#                                                                               'end': 16,
#                                                                               'parents': [],
#                                                                               'size': 17,
#                                                                               'start': 0,
#                                                                               'text': '[1bla[2bla/2]/1]'}},
#  ('[1', '/2]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
#                                                                   'end': 13,
#                                                                   'parents': [],
#                                                                   'size': 14,
#                                                                   'start': 0,
#                                                                   'text': '[1bla[2bla/2]'}},
#  ('[2', '/1]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
#                                                                'end': 16,
#                                                                'parents': [],
#                                                                'size': 12,
#                                                                'start': 5,
#                                                                'text': '[2bla/2]/1]'}},
#  ('[2', '/2]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
#                                                    'end': 13,
#                                                    'parents': [],
#                                                    'size': 9,
#                                                    'start': 5,
#                                                    'text': '[2bla/2]'}}}
# r3-----------------------------------------------------------------
# {('[1', '/1]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
#                                                                               'end': 16,
#                                                                               'parents': [],
#                                                                               'size': 17,
#                                                                               'start': 0,
#                                                                               'text': '[1bla[2bla/2]/1]'}},
#  ('[2', '/2]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
#                                                    'end': 13,
#                                                    'parents': [],
#                                                    'size': 9,
#                                                    'start': 5,
#                                                    'text': '[2bla/2]'}}}
# r4-----------------------------------------------------------------
# {('[1', '/1]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
#                                                                               'end': 16,
#                                                                               'parents': [],
#                                                                               'size': 17,
#                                                                               'start': 0,
#                                                                               'text': '[1bla[2bla/2]/1]'}},
#  ('[2', '/2]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
#                                                    'end': 13,
#                                                    'parents': [],
#                                                    'size': 9,
#                                                    'start': 5,
#                                                    'text': '[2bla/2]'}}}