]> git.armaanb.net Git - phrases.git/blob - extract.py
add argument parsing, fix program logic
[phrases.git] / extract.py
1 #!/usr/bin/env python3
2 # Extract Latin famous phrases from wikipedia
3 # Armaan Bhojwani 2020
4
5 import argparse
6 import sys
7 import csv
8 import requests
9 from bs4 import BeautifulSoup
10
def main(args=None):
    """Scrape the Wikipedia list of Latin phrases into a CSV file.

    Downloads a pinned revision of "List of Latin phrases (full)", walks
    every ``wikitable`` on the page and writes one CSV row for each table
    row that contains ``<td>`` cells. Each row is laid out as
    ``id, <cell texts...>, length of the first cell's text``.

    Args:
        args: Command-line arguments to parse, without the program name.
            ``None`` (the default) lets argparse read ``sys.argv[1:]`` at
            call time.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        OSError: If the output file cannot be opened for writing.
    """
    # Argument parsing
    parser = argparse.ArgumentParser(
        description="Generate CSV file of Latin famous phrases from Wikipedia.")
    parser.add_argument("-o", "--output",
                        default="phrases.csv",
                        help="set custom output file location")
    # Pass the caller-supplied list through so main([...]) is honoured.
    options = parser.parse_args(args)

    # The oldid pins a specific revision of the page.
    url = ('https://en.wikipedia.org/w/index.php?title=List_of_Latin_phrases_('
           'full)&oldid=986793908')
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page into an empty CSV.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    def table_rows():
        """Yield the direct <tr> children of every wikitable, in page order."""
        for table in soup.find_all("table", attrs={"class": "wikitable"}):
            # html.parser does not insert a missing <tbody>, so fall back to
            # the table element itself when the markup has none.
            body = table.tbody if table.tbody is not None else table
            yield from body.find_all("tr", recursive=False)

    # newline="" is what the csv module expects from its file object; UTF-8
    # keeps non-ASCII phrase text writable regardless of the platform default.
    with open(options.output, 'w', newline="", encoding="utf-8") as f:
        writer = csv.writer(f, lineterminator="\n")

        # write header
        writer.writerow(['id', 'Latin', 'English', 'Notes', 'Length'])

        # The id counts every <tr>, including rows without <td> cells that
        # are skipped below, so the ids written out are not contiguous.
        for phrase_id, row in enumerate(table_rows()):
            cells = row.find_all("td", recursive=False)
            if not cells:
                continue

            texts = [cell.get_text(" ", strip=True).rstrip() for cell in cells]
            # Last column is the character length of the first cell's text.
            writer.writerow([phrase_id, *texts, len(texts[0])])
51
# Run the extractor only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()