]> git.armaanb.net Git - phrases.git/blob - extract.py
Make manconvert smarter
[phrases.git] / extract.py
1 #!/usr/bin/env python3
2 # Extract Latin famous phrases from Wikipedia
3 # Armaan Bhojwani 2021
4
5 import argparse
6 import sqlite3
7 import requests
8 from bs4 import BeautifulSoup
9
10
def parse_args():
    """Parse the command line of the extractor.

    Two options are recognised:

    * ``-o`` / ``--output``: path of the database file to write
      (defaults to ``phrases.db``).
    * ``-v`` / ``--version``: print the program version and exit.

    Returns:
        The ``argparse.Namespace`` holding the parsed ``output`` value.
    """
    cli = argparse.ArgumentParser(
        description="Generate database of Latin famous phrases from Wikipedia")

    cli.add_argument(
        "-o", "--output",
        help="set custom output file location",
        default="phrases.db")
    cli.add_argument(
        "-v", "--version",
        version="phrases-extract 1.0.3",
        action="version")

    namespace = cli.parse_args()
    return namespace
21
22
def get_html(url):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` document built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    print("downloading webpage")
    # requests waits indefinitely unless a timeout is given; bound the wait
    # so a stalled connection cannot hang the script.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page as if it were the article.
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")
26
27
def prep_database():
    """Reset the ``phrases`` table to an empty state.

    Any existing ``phrases`` table is dropped and a fresh one is created
    with the columns ``id``, ``latin``, ``english``, ``notes`` and
    ``length``. Statements are run through the module-level cursor ``c``.
    """
    print("prepping database")
    statements = (
        "DROP TABLE IF EXISTS phrases",
        """CREATE TABLE phrases(
              id INTEGER,
              latin TEXT,
              english TEXT,
              notes TEXT,
              length INTEGER)""",
    )
    # Order matters: the old table must be gone before it is recreated.
    for statement in statements:
        c.execute(statement)
37
38
def fill_database(list_table):
    """Insert the phrases found in *list_table* into the ``phrases`` table.

    Each direct ``<tr>`` child of a table body is inspected; rows with at
    least three direct ``<td>`` cells are stored as (latin, english, notes)
    together with the character length of the Latin text. Rows with fewer
    cells are skipped.

    Inserts go through the module-level cursor ``c`` and are committed once,
    after all tables have been processed, on the module-level connection
    ``conn``.

    Args:
        list_table: Iterable of BeautifulSoup ``<table>`` tags.
    """
    # Phrase id. It advances for every row, including skipped ones, so the
    # stored ids are not necessarily contiguous.
    i = 0
    print("iterating through tables")
    for table in list_table:
        # The markup may lack an explicit <tbody>; in that case the rows are
        # direct children of the table itself.
        body = table.tbody if table.tbody is not None else table
        for row in body.find_all("tr", recursive=False):
            cell = row.find_all("td", recursive=False)
            if len(cell) > 2:
                print(i, end="\r")  # in-place progress indicator

                latin, english, notes = (
                    td.get_text(" ", strip=True).rstrip() for td in cell[:3])

                c.execute("""INSERT INTO phrases
                         (id, latin, english, notes, length)
                         VALUES(?, ?, ?, ?, ?)""",
                          (i, latin, english, notes, len(latin)))
            i += 1

    # One commit for the whole batch: committing after every insert forces a
    # separate transaction per row and is needlessly slow.
    conn.commit()
58
59
def get_tables():
    """Fetch the pinned Wikipedia revision and return its phrase tables.

    The page is requested by ``oldid`` so that the same revision is used on
    every run.

    Returns:
        The ``find_all`` result containing every ``<table>`` tag whose class
        is ``wikitable``.
    """
    # Built from adjacent string literals: a triple-quoted string wrapped
    # across lines would embed the newline and indentation in the URL.
    url = ("https://en.wikipedia.org/w/index.php"
           "?title=List_of_Latin_phrases_(full)&oldid=986793908")
    return get_html(url).find_all("table", attrs={"class": "wikitable"})
64
65
def main():
    """Rebuild the phrase database from scratch.

    The ``phrases`` table is recreated first, then the Wikipedia tables are
    fetched and their rows inserted.
    """
    prep_database()
    tables = get_tables()
    fill_database(tables)
69
70
if __name__ == "__main__":
    # Script entry point. ``conn`` and ``c`` are deliberately module-level:
    # prep_database() and fill_database() use them as globals.
    args = parse_args()
    conn = sqlite3.connect(args.output)
    c = conn.cursor()
    try:
        main()
    finally:
        # Release the database file even if the download or parsing fails.
        conn.close()
75     main()