#!/usr/bin/env python
# Parse markdown table into tuple of lists
# Armaan Bhojwani 2021
#
# Parsing logic moved here from lightcards.py.

import sys
from bs4 import BeautifulSoup
import markdown


def md2html(file):
    """Render the markdown file at *file* to an HTML string.

    The file is read as UTF-8 and converted with the ``tables`` extension
    enabled, so markdown tables are emitted as ``<table>`` markup.
    """
    with open(file, "r", encoding="utf-8") as input_file:
        return markdown.markdown(input_file.read(), extensions=["tables"])


def parse_html(html):
    """Extract table headers and two-column rows from *html*.

    Returns a tuple ``(headers, rows)``:

    * ``headers`` -- the text of every ``<th>`` element, in document order.
    * ``rows`` -- one ``[cell, cell]`` list for every ``<tr>`` that holds
      exactly two ``<td>`` cells.  Any other row is dropped; this includes
      the header row itself, which contains ``<th>`` but no ``<td>``.

    Cell and header text has trailing whitespace stripped.
    """

    def clean_text(inp):
        return inp.get_text().rstrip()

    soup = BeautifulSoup(html, "html.parser")

    # Header texts are returned as-is.  They are strings, so the "exactly two
    # items" filter that applies to rows must not be run on them: it would
    # compare each header's character count to 2 and discard normal headers.
    headers = [clean_text(th) for th in soup.find_all("th")]

    rows = [
        [clean_text(td) for td in tr.find_all("td")]
        for tr in soup.find_all("tr")
    ]

    # Filter into a new list rather than calling list.remove() while
    # iterating: removing during iteration skips the element that follows
    # each removal, so consecutive malformed rows would slip through.
    return (headers, [row for row in rows if len(row) == 2])


def main(file):
    """Parse the markdown file at *file*; see parse_html for the result."""
    return parse_html(md2html(file))


if __name__ == "__main__":
    # Fail with a readable message instead of an IndexError traceback when
    # no input file is given.  Extra arguments are ignored.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} FILE")
    print(main(sys.argv[1]))