-
Notifications
You must be signed in to change notification settings - Fork 46
/
Copy pathcrawlprep.py
57 lines (46 loc) · 1.65 KB
/
crawlprep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/env python3
"convert html to documents that can be ingested into cloudsearch"
import json
from contextlib import contextmanager
from io import StringIO
import os
import lxml.html
import sh
@contextmanager
def cd(newdir):
    """Temporarily switch the process working directory to *newdir*.

    A leading ``~`` in *newdir* is expanded before the directory change.
    The directory that was current on entry is restored when the ``with``
    body finishes, whether it completes normally or raises.
    """
    origin = os.getcwd()
    target = os.path.expanduser(newdir)
    os.chdir(target)
    try:
        yield
    finally:
        # Always go back to where we started, even on error.
        os.chdir(origin)
def main(sitedir="_site", outfile=None,
         title_suffix=" - Fred Hutch Biomedical Data Science Wiki"):
    """Convert the HTML pages under *sitedir* into one CloudSearch batch file.

    Every ``*.html`` file found below *sitedir* becomes one ``add`` document
    whose ``id`` is the page's site-relative URL, whose ``title`` comes from
    the page's ``<title>`` element and whose ``content`` is the plain-text
    rendering of the page produced by ``pandoc``. All documents are written
    as a single JSON list to *outfile*.

    Args:
        sitedir: Root directory of the rendered site. Defaults to ``"_site"``.
        outfile: Path of the JSON file to write. Defaults to ``output.json``
            in the directory the script was started from.
        title_suffix: Text removed from every page title (the site-wide
            suffix appended to each page's ``<title>``).
    """
    # Resolve the output path before changing directory so that a relative
    # path still refers to the directory the script was started from.
    if outfile is None:
        outfile = os.path.join(os.getcwd(), "output.json")
    else:
        outfile = os.path.abspath(outfile)

    outlist = []
    with cd(sitedir):
        for root, _dirs, files in os.walk(".", topdown=False):
            # Sorted so the documents of a directory come out in a
            # reproducible order.
            for name in sorted(files):
                if not name.endswith(".html"):
                    continue
                filename = os.path.join(root, name)
                # Walking from "." yields "./path/page.html"; dropping the
                # leading dot leaves the site-relative URL "/path/page.html".
                url = filename.lstrip(".")
                print(url)

                doc = lxml.html.parse(filename)
                # A page may lack a <title> element, or have an empty one;
                # fall back to an empty title instead of crashing.
                title_el = doc.find(".//title")
                raw_title = title_el.text if title_el is not None else None
                title = (raw_title or "").replace(title_suffix, "")

                # Let pandoc render the page as plain text, captured in memory.
                outio = StringIO()
                sh.pandoc("-f", "html", "-t", "plain", filename, _out=outio)

                outlist.append({
                    "type": "add",
                    "id": url,
                    "fields": {"title": title, "content": outio.getvalue()},
                })

    with open(outfile, "w", encoding="utf-8") as outfh:
        json.dump(outlist, outfh)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()