X-Git-Url: https://git.armaanb.net/?a=blobdiff_plain;f=lightcards%2Fparse.py;h=082b7dce1dedf0fca6879bf91818915f39fb086f;hb=a32f80e459c2fa88cfde61fd80d1a8543cc51e8b;hp=d70c4e977935752853167a775dcc7f8aacb96dc6;hpb=23bedf0b57ac06fbee12af000f42c992d3227dd1;p=lightcards.git diff --git a/lightcards/parse.py b/lightcards/parse.py index d70c4e9..082b7dc 100644 --- a/lightcards/parse.py +++ b/lightcards/parse.py @@ -1,7 +1,6 @@ # Parse markdown table into tuple of lists # Armaan Bhojwani 2021 -import sys from bs4 import BeautifulSoup import markdown @@ -10,31 +9,49 @@ from .deck import Card def md2html(file): """Use the markdown module to convert input to HTML""" - try: - return markdown.markdown(open(file, "r").read(), extensions=['tables']) - except FileNotFoundError: - print(f"lightcards: \"{file}\": No such file or directory") - exit(1) + outp = "" + for i in file: + try: + outp += markdown.markdown( + open(i, "r").read(), extensions=["tables"] + ) + except FileNotFoundError: + raise Exception( + f'lightcards: "{i}": No such file or directory' + ) from None + return outp -def parse_html(html): + +def parse_html(html, args, conf): """Use BeautifulSoup to parse the HTML""" + def clean_text(inp): return inp.get_text().rstrip() - soup = BeautifulSoup(html, 'html.parser') - outp = [] - - for x in soup.find_all("tr"): - outp.append(Card([clean_text(y) for y in x.find_all("td")[:2]])) + soup = BeautifulSoup(html, "html.parser") + outp, ths = [], [] + + if args.table: + table_num = args.table + elif conf["table"]: + table_num = conf["table"] + else: + table_num = False + + for i, table in enumerate(soup.find_all("table"), start=1): + ths = table.find_all("th") + if len(ths) != 2: + if conf["lenient"] or not args.lenient: + raise Exception("lightcards: Headings malformed") + elif (table_num and i == table_num) or not table_num: + try: + for x in table.find_all("tr"): + y = x.find_all("td") + if y: + outp.append(Card(tuple([clean_text(z) for z in y]))) + except AttributeError: + raise Exception("lightcards: No table found") from None # Return a tuple of nested lists - return ([clean_text(x) for x in soup.find_all("th")][:2], outp[1:]) - - -def main(file): - return parse_html(md2html(file)) - - -if __name__ == "__main__": - print(main(sys.argv[1])) + return ([clean_text(x) for x in ths], outp)