]> git.armaanb.net Git - phrases.git/blob - extract.py
8f8877a8fd51968751022a80bc2272edb120de49
[phrases.git] / extract.py
1 #!/usr/bin/env python3
2 # Extract Latin famous phrases from wikipedia
3 # Armaan Bhojwani 2020
4
5 import argparse
6 import sqlite3
7 import sys
8 import requests
9 from bs4 import BeautifulSoup
10
def main(args=sys.argv[1:]):
    """Build an SQLite database of Latin phrases scraped from Wikipedia.

    Downloads a pinned revision of Wikipedia's "List of Latin phrases (full)"
    page, drops and recreates the ``phrases`` table in the output database,
    and inserts one record for every ``wikitable`` row that has more than two
    direct ``<td>`` cells. The first three cells are stored as ``latin``,
    ``english`` and ``notes``; ``length`` is the character count of ``latin``.

    Args:
        args: Command-line arguments without the program name. Recognises
            ``-o``/``--output`` to choose the database file (default
            ``phrases.db``).

    Raises:
        requests.RequestException: If the download times out, fails, or
            returns an HTTP error status. This happens before the database
            is touched, so an existing table is left intact.
    """
    # Argument parsing
    parser = argparse.ArgumentParser(
        description="Generate SQLite db of Latin famous phrases from Wikipedia.")
    parser.add_argument("-o", "--output",
                        default="phrases.db",
                        help="set custom output file location")
    # Parse the list we were given rather than silently re-reading sys.argv.
    args = parser.parse_args(args)

    # Adjacent literals are concatenated at compile time, so the URL contains
    # no stray newline or indentation inside the page title.
    url = ("https://en.wikipedia.org/w/index.php"
           "?title=List_of_Latin_phrases_(full)&oldid=986793908")
    print("downloading webpage")
    # A timeout prevents hanging forever; raise_for_status() keeps an error
    # page from being parsed into an empty database.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    print("prepping database")
    conn = sqlite3.connect(args.output)
    try:
        c = conn.cursor()
        c.execute("DROP TABLE IF EXISTS phrases")
        c.execute("""CREATE TABLE phrases(
                  id INTEGER,
                  latin TEXT,
                  english TEXT,
                  notes TEXT,
                  length INTEGER)""")

        # Running row counter used as the phrase id.
        # NOTE(review): it advances for every <tr> examined, including rows
        # skipped below for having too few cells, so ids are not contiguous.
        # Kept as-is because consumers may depend on the existing ids.
        i = 0

        # iterate through the tables in the page
        list_table = soup.find_all("table", attrs={"class": "wikitable"})
        print("iterating through tables")
        for table in list_table:
            # html.parser does not synthesise <tbody>; fall back to the table
            # itself when the markup has none.
            body = table.tbody
            if body is None:
                body = table
            for row in body.find_all("tr", recursive=False):
                cell = row.find_all("td", recursive=False)
                if len(cell) > 2:
                    print(i, end="\r")
                    # strip=True already trims each text fragment, so no
                    # further stripping is needed.
                    latin = cell[0].get_text(" ", strip=True)
                    english = cell[1].get_text(" ", strip=True)
                    notes = cell[2].get_text(" ", strip=True)

                    c.execute(
                        """INSERT INTO phrases (id, latin, english, notes, length)
                           VALUES(?, ?, ?, ?, ?)""",
                        (i, latin, english, notes, len(latin)))

                i += 1

        # One commit for the whole import instead of one per row.
        conn.commit()
        c.close()
    finally:
        # Release the database handle even if scraping or an insert failed.
        print("closing database")
        conn.close()
59
# Run the extraction only when this file is executed as a script, so that
# importing the module does not trigger a download or touch any database.
if __name__ == "__main__":
    main()