git.armaanb.net Git - lightcards.git/blob - lightcards/parse.py
Add setup.py, and make into actual Python module
[lightcards.git] / lightcards / parse.py
1 #!/usr/bin/env python
2 # Parse markdown table into tuple of lists
3 # Armaan Bhojwani 2021
4
5 import sys
6 from bs4 import BeautifulSoup
7 import markdown
8
9
def md2html(file):
    """Render the markdown file at path ``file`` to an HTML string.

    The file is read as UTF-8 text and converted with the ``tables``
    extension enabled.
    """
    with open(file, "r", encoding="utf-8") as handle:
        source = handle.read()
    return markdown.markdown(source, extensions=['tables'])
13
14
def parse_html(html):
    """Extract the header cells and the two-column rows from HTML markup.

    Args:
        html: HTML markup to scan for table rows and header cells.

    Returns:
        A tuple ``(headers, rows)``. ``headers`` is a list with the text of
        every ``<th>`` element. ``rows`` is a list of two-item lists, one per
        ``<tr>`` that contains exactly two ``<td>`` cells; rows with any
        other cell count (including header rows, which have no ``<td>``)
        are dropped.
    """
    def clean_text(inp):
        # Only trailing whitespace is stripped; leading whitespace is kept.
        return inp.get_text().rstrip()

    def clean_list(inp):
        # Filter into a new list rather than calling remove() while
        # iterating: removing during iteration shifts the remaining items
        # down, so the loop skips the element right after each removal and
        # a malformed row following another malformed row would survive.
        return [row for row in inp if len(row) == 2]

    soup = BeautifulSoup(html, 'html.parser')
    outp = [[clean_text(cell) for cell in row.find_all("td")]
            for row in soup.find_all("tr")]

    return ([clean_text(x) for x in soup.find_all("th")],
            clean_list(outp))
33
def main(file):
    """Convert the markdown file at ``file`` to HTML and parse it.

    Returns whatever ``parse_html`` returns for the rendered markup.
    """
    rendered = md2html(file)
    return parse_html(rendered)
36
if __name__ == "__main__":
    # Script entry point: parse the file named by the first command-line
    # argument and print the result.
    parsed = main(sys.argv[1])
    print(parsed)