# Parse markdown table into tuple of lists
# Armaan Bhojwani 2021
# Source: lightcards.git, lightcards/parse.py (blob 8e0d320)

import sys

from bs4 import BeautifulSoup
import markdown

from .deck import Card


def md2html(file):
    """Use the markdown module to convert input to HTML.

    ``file`` is an iterable of paths to markdown files. A single path given
    as a plain string is also accepted and treated as a one-element list, so
    that it is not iterated character by character.

    Returns the HTML of every input file concatenated in order.

    Raises ``Exception`` with a lightcards-style message if a path does not
    exist.
    """
    if isinstance(file, str):
        file = [file]

    chunks = []
    for path in file:
        # Only the open() can raise FileNotFoundError, so keep the try small
        # and let the context manager close the handle.
        try:
            with open(path, "r", encoding="utf-8") as input_file:
                text = input_file.read()
        except FileNotFoundError:
            raise Exception(
                f'lightcards: "{path}": No such file or directory'
            ) from None

        chunks.append(markdown.markdown(text, extensions=["tables"]))

    return "".join(chunks)


def parse_html(html):
    """Use BeautifulSoup to parse the HTML.

    Every ``<table>`` in ``html`` is scanned; each row that contains ``<td>``
    cells becomes one ``Card`` built from a tuple of the cleaned cell texts.

    Returns a tuple ``(headings, cards)`` where ``headings`` is the list of
    cleaned ``<th>`` texts of the last table and ``cards`` is the list of
    ``Card`` objects from all tables.

    Raises ``Exception`` if there is no table at all, or if any table does
    not have exactly two headings.
    """

    def clean_text(inp):
        return inp.get_text().rstrip()

    tables = BeautifulSoup(html, "html.parser").find_all("table")
    # find_all() returns an empty result instead of raising, so the missing
    # table case has to be checked explicitly.
    if not tables:
        raise Exception("lightcards: No table found")

    outp = []
    ths = []

    for table in tables:
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            # Heading rows contain only <th> cells; skip them.
            if cells:
                outp.append(Card(tuple(clean_text(cell) for cell in cells)))

        ths = table.find_all("th")
        if len(ths) != 2:
            raise Exception("lightcards: Headings malformed")

    # Return a tuple of (list of heading strings, list of Cards)
    return ([clean_text(x) for x in ths], outp)


def main(file):
    """Convert the given markdown file(s) to HTML and parse out the cards."""
    return parse_html(md2html(file))


if __name__ == "__main__":
    # Pass every path given on the command line; md2html expects a list.
    print(main(sys.argv[1:]))