-
Notifications
You must be signed in to change notification settings - Fork 72
/
markdown.py
135 lines (96 loc) · 3.52 KB
/
markdown.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from pathlib import Path
from typing import Iterator, NamedTuple, Optional
from ..common import get_logger, Extraction, Url, PathIsh, Res, Visit, Loc, file_mtime, logger
import mistletoe # type: ignore
from mistletoe.span_token import AutoLink, Link # type: ignore
import mistletoe.block_token as BT # type: ignore
from mistletoe.html_renderer import HTMLRenderer # type: ignore
# One module-wide HTML renderer instance; _ashtml() below renders tokens through it.
renderer = HTMLRenderer()
# Every name listed in mistletoe.block_token.__all__, resolved to the object it names
# and packed into a tuple so it can be passed straight to isinstance().
block_tokens = tuple([getattr(BT, token_name) for token_name in BT.__all__])
class Parsed(NamedTuple):
    """A single link found in a markdown document, with optional context."""

    # Target of the link, taken from the link token's ``target`` attribute.
    url: Url
    # HTML_MARKER followed by rendered HTML surrounding the link, or None
    # when no context is available.
    context: Optional[str]
# What the parsers in this module yield for each link: a Parsed record, or the
# Exception raised while extracting it (errors are yielded, not raised).
# NOTE(review): Res comes from ..common; presumably a value-or-Exception alias -- confirm there.
Result = Res[Parsed]
# NOTE: surprising mistletoe behaviour, recorded here for reference:
#
# from mistletoe import Document
# d = Document('''
# # heading
# ## sub
# ## sub2
# ''')
# d.children[0].content
# Out[13]: 'sub2'
# Not ideal, but acceptable for now.
# Prefix put in front of a context string to mark that what follows is rendered HTML.
HTML_MARKER = '!html '
def _ashtml(block) -> str:
    """Render a mistletoe token to HTML via the module-level renderer.

    If the rendered string both starts with ``<p>`` and ends with ``</p>``,
    those two tags are cut off and only the text between them is returned;
    otherwise the rendered string is returned unchanged.
    """
    html = renderer.render(block)
    wrapped_in_p = html.startswith('<p>') and html.endswith('</p>')
    if not wrapped_in_p:
        return html
    # crude, but good enough for now: slice off the opening and closing tags
    return html[len('<p>'):-len('</p>')]
class Parser:
    """Extract links from a markdown file by walking its mistletoe token tree.

    ``walk()`` yields one item per link token (``AutoLink`` / ``Link``) found:
    a ``Parsed`` record whose context is the HTML of the nearest enclosing
    block token, or the ``Exception`` raised while extracting that link.
    """

    def __init__(self, path: Path):
        """Read and parse the markdown file at *path*."""
        # Decode explicitly as UTF-8 rather than with the platform's locale
        # encoding, so the same file parses the same way on every system.
        self.doc = mistletoe.Document(path.read_text(encoding='utf-8'))

    def _extract(self, cur, last_block) -> Iterator[Parsed]:
        """Yield a ``Parsed`` for *cur* if it is a link token, otherwise nothing.

        *last_block* is the closest block token at or above *cur* (or None);
        it is rendered to HTML and used as the link's context.
        """
        if not isinstance(cur, (AutoLink, Link)):
            # hopefully these are the only token types that carry links
            return
        url = cur.target
        # TODO: mistletoe doesn't seem to preserve line numbers/positions in the text
        # It also can't output markdown: https://github.com/miyuchina/mistletoe/issues/4
        context = None if last_block is None else HTML_MARKER + _ashtml(last_block)
        yield Parsed(url=url, context=context)

    def _walk(self, cur, last_block) -> Iterator[Result]:
        """Depth-first (parent before children) traversal starting at *cur*."""
        if isinstance(cur, block_tokens):
            # remember the innermost block so links below it get it as context
            last_block = cur

        try:
            yield from self._extract(cur, last_block)
        except Exception as e:
            # best effort: log, hand the error to the consumer, and keep walking
            logger.exception(e)
            yield e

        # tokens without a 'children' attribute are treated as leaves
        children = getattr(cur, 'children', [])
        for c in children:
            yield from self._walk(c, last_block=last_block)

    def walk(self):
        """Yield results for every link in the document, starting from the root."""
        yield from self._walk(self.doc, last_block=None)
def extract_from_file(fname: PathIsh) -> Iterator[Extraction]:
    """Yield a ``Visit`` for every link found in the markdown file *fname*.

    Items that the parser reports as exceptions are yielded unchanged instead
    of being raised. Every visit gets the same ``dt``: the value returned by
    ``file_mtime`` for the file (presumably its modification time).
    """
    path = Path(fname)
    fallback_dt = file_mtime(path)
    for item in Parser(path).walk():
        if isinstance(item, Exception):
            # errors travel through the stream as values
            yield item
            continue
        yield Visit(
            url=item.url,
            dt=fallback_dt,
            locator=Loc.file(fname),  # TODO line number
            context=item.context,
        )
class TextParser(Parser):
    '''
    Parser for markdown held in a string (e.g. reddit/github comments)
    rather than in a file.

    Unlike the file parser, which uses the enclosing block as context,
    every link here gets the whole rendered message as its context.
    '''

    def __init__(self, text: str):
        # Parser.__init__ reads from a path, so build the document from the string directly
        self.doc = mistletoe.Document(text)

    def _doc_ashtml(self):
        '''
        HTML_MARKER + html rendering of the entire message/document,
        computed on first use and stored on the instance afterwards
        '''
        try:
            return self._html
        except AttributeError:
            # first call: nothing cached yet
            self._html = HTML_MARKER + _ashtml(self.doc)
            return self._html

    def _extract(self, cur, last_block = None) -> Iterator[Parsed]:
        # last_block is accepted for compatibility with Parser._walk but not used
        if isinstance(cur, (AutoLink, Link)):
            yield Parsed(url=cur.target, context=self._doc_ashtml())
def extract_from_text(text: str) -> Iterator[Result]:
    '''
    Extract links from a markdown string, such as a github/reddit message.
    The rendered contents of the whole comment/body serve as the context
    of every link.
    '''
    # note: this yields Result items (link/context or an exception), not Visit;
    # it is up to the caller to add dt/duration etc.
    parser = TextParser(text)
    yield from parser.walk()