git.armaanb.net Git - lightcards.git/commitdiff
Move parsing from lightcards.py to parse.py
author: Armaan Bhojwani <me@armaanb.net>
Mon, 25 Jan 2021 15:52:08 +0000 (10:52 -0500)
committer: Armaan Bhojwani <me@armaanb.net>
Mon, 25 Jan 2021 15:52:08 +0000 (10:52 -0500)
src/parse.py [new file with mode: 0755]

diff --git a/src/parse.py b/src/parse.py
new file mode 100755 (executable)
index 0000000..f965516
--- /dev/null
+++ b/src/parse.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+# Parse markdown table into tuple of lists
+# Armaan Bhojwani 2021
+
+import sys
+from bs4 import BeautifulSoup
+import markdown
+
+
+def md2html(file):
+    """Read the markdown document at ``file`` and render it to HTML.
+
+    The file is decoded as UTF-8 and converted with the ``tables``
+    extension enabled.  Returns the HTML produced by the converter.
+    """
+    with open(file, "r", encoding="utf-8") as handle:
+        source = handle.read()
+    return markdown.markdown(source, extensions=['tables'])
+
+
+def parse_html(html):
+    """Extract table headers and two-column rows from an HTML string.
+
+    Returns a tuple ``(headers, rows)``:
+
+    * ``headers`` is a list holding the text of every ``<th>`` element.
+    * ``rows`` is a list of ``[first, second]`` text pairs, one for each
+      ``<tr>`` that contains exactly two ``<td>`` cells.  Rows with any
+      other number of ``<td>`` cells (for example a row made only of
+      ``<th>`` cells) are dropped.
+
+    Trailing whitespace is stripped from every piece of text.
+    """
+    def clean_text(inp):
+        return inp.get_text().rstrip()
+
+    soup = BeautifulSoup(html, 'html.parser')
+
+    # Header cells are plain strings, so they must not go through the
+    # "exactly two items" check used for rows: applied to a string it
+    # would test the number of characters instead.
+    headers = [clean_text(x) for x in soup.find_all("th")]
+
+    # Collect the valid rows in a fresh list.  Removing items from a list
+    # while iterating over it skips the element after each removal, which
+    # lets malformed rows slip through.
+    rows = []
+    for x in soup.find_all("tr"):
+        cells = [clean_text(y) for y in x.find_all("td")]
+        if len(cells) == 2:
+            rows.append(cells)
+
+    return (headers, rows)
+
+def main(file):
+    """Render the markdown file at ``file`` to HTML and parse the result.
+
+    Returns whatever ``parse_html`` produces for the rendered document.
+    """
+    html = md2html(file)
+    return parse_html(html)
+
+if __name__ == "__main__":
+    # Script entry point: parse the file named by the first command-line
+    # argument and print the resulting tuple.
+    result = main(sys.argv[1])
+    print(result)