]> git.armaanb.net Git - phrases.git/blob - extract.py
Add --version argument to extract.py
[phrases.git] / extract.py
1 #!/usr/bin/env python3
2 # Extract Latin famous phrases from wikipedia
3 # Armaan Bhojwani 2020
4
5 import argparse
6 import sqlite3
7 import requests
8 from bs4 import BeautifulSoup
9
def parse_args():
    """Parse the extractor's command-line options from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with two attributes:
        ``output`` -- path of the database file (defaults to "phrases.db");
        ``version`` -- True when -v/--version was given, otherwise False.
    """
    cli = argparse.ArgumentParser(
        description="Generate SQLite db of Latin famous phrases from Wikipedia.",
    )

    # Each entry is (option flags, keyword settings) for one add_argument call.
    option_specs = (
        (("-o", "--output"),
         {"default": "phrases.db",
          "help": "set custom output file location"}),
        (("-v", "--version"),
         {"action": "store_true",
          "help": "print script version"}),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
20
def get_html(url):
    """Download *url* and return the page parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        "html.parser" backend.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    print("downloading webpage")
    # requests applies no timeout by default, so an unresponsive server would
    # block the script indefinitely.
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing the error page as if it
    # were the phrase list.
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")
24
def prep_database():
    """Recreate an empty ``phrases`` table using the module-level cursor ``c``.

    An existing ``phrases`` table is dropped first, so every run starts from
    a clean table with the columns id, latin, english, notes and length.
    """
    print("prepping database")
    create_table = """CREATE TABLE phrases(
              id INTEGER,
              latin TEXT,
              english TEXT,
              notes TEXT,
              length INTEGER)"""
    # Order matters: the old table has to go before the new one is created.
    for statement in ("DROP TABLE IF EXISTS phrases", create_table):
        c.execute(statement)
34
def fill_database(list_table):
    """Insert the phrases found in *list_table* into the ``phrases`` table.

    Every ``<tr>`` that has more than two direct ``<td>`` children contributes
    one database row: the first three cells are stored as latin, english and
    notes, and the length of the latin text is stored as ``length``.

    Writes go through the module-level cursor ``c`` and are committed on the
    module-level connection ``conn``.

    Args:
        list_table: Iterable of BeautifulSoup ``<table>`` tags to scan.
    """
    # Phrase id. It advances on every <tr>, including rows that are skipped
    # below, so stored ids can have gaps; kept that way so ids written by
    # earlier runs do not shift.
    i = 0
    print("iterating through tables")
    for table in list_table:
        # table.tbody is None when the markup has no <tbody>; the rows are
        # then direct children of the table itself.
        body = table.tbody if table.tbody is not None else table
        for row in body.find_all("tr", recursive=False):
            cells = row.find_all("td", recursive=False)
            if len(cells) > 2:
                print(i, end="\r")

                # strip=True already trims each text fragment, so no further
                # rstrip() is needed on the joined result.
                latin, english, notes = (
                    cell.get_text(" ", strip=True) for cell in cells[:3])

                c.execute("""INSERT INTO phrases
                         (id, latin, english, notes, length)
                         VALUES(?, ?, ?, ?, ?)""",
                          (i, latin, english, notes, len(latin)))
            i += 1
    # Commit once for the whole batch rather than once per inserted row.
    conn.commit()
54
def get_tables():
    """Fetch the pinned Wikipedia revision and return its phrase tables.

    The URL carries an explicit ``oldid`` so every run reads the same
    revision of the "List of Latin phrases (full)" article.

    Returns:
        The result of ``find_all`` for ``<table>`` elements whose class is
        "wikitable" in the downloaded page.
    """
    # Adjacent string literals keep the line short without embedding a
    # newline and indentation inside the URL, as a triple-quoted literal
    # split across lines would.
    url = ("https://en.wikipedia.org/w/index.php"
           "?title=List_of_Latin_phrases_(full)&oldid=986793908")
    return get_html(url).find_all("table", attrs={"class": "wikitable"})
59
def main():
    """Run the action selected on the command line.

    Reads the module-level ``args``: without --version the table is rebuilt
    and filled from the downloaded Wikipedia tables; with --version only the
    module-level ``version`` string is printed.
    """
    if not args.version:
        prep_database()
        phrase_tables = get_tables()
        fill_database(phrase_tables)
        return

    print(version)
66
if __name__ == "__main__":
    # Script entry point. The names assigned here are module globals that
    # main(), prep_database() and fill_database() read directly.
    version = "phrases extract.py 1.0.1"
    args = parse_args()
    if args.version:
        # Answer --version before opening the database: connecting to the
        # output path can create the file, which a version query should not do.
        print(version)
    else:
        conn = sqlite3.connect(args.output)
        c = conn.cursor()
        try:
            main()
        finally:
            # Release the database file even if the download or parsing fails.
            conn.close()