X-Git-Url: https://git.armaanb.net/?a=blobdiff_plain;f=extract.py;h=a76fbfbd991d599b789ef062097df750237bddf8;hb=3e4ee2d1a36d23079d68d3dda8b004bbd7cc8106;hp=a454569a2eebff59cb883c512b389bd7bb98434b;hpb=ce91c175943f73684423627c098ae3d7f4fa94ab;p=phrases.git

diff --git a/extract.py b/extract.py
index a454569..a76fbfb 100755
--- a/extract.py
+++ b/extract.py
@@ -1,23 +1,45 @@
 #!/usr/bin/env python3
-# Tool to extract famous phrases from wikipedia
+# Extract Latin famous phrases from wikipedia
+# Armaan Bhojwani 2020
+
 from bs4 import BeautifulSoup
 import requests
+import csv
+
+def main():
+    url = 'https://en.wikipedia.org/wiki/List_of_Latin_phrases_(full)'
+    response = requests.get(url)
+    html = response.content
+
+    soup = BeautifulSoup(html, "html.parser")
+    list_table = soup.find_all("table", attrs={"class":"wikitable"})
 
+    with open('phrases.csv', 'w') as f:
+        writer = csv.writer(f, lineterminator="\n")
+
+        i = 0 # For the phrase id
 
-url = 'https://en.wikipedia.org/wiki/List_of_Latin_phrases_(full)'
-response = requests.get(url)
-html = response.content
+        # write header
+        headers = ['id', 'Latin', 'English', 'Notes', 'Length']
+        writer.writerow(headers)
 
-soup = BeautifulSoup(html, "html.parser")
-list_table = soup.find_all("table", attrs={"class":"wikitable"})
+        # iterate through the tables in the page
+        for table in list_table:
+            for row in table.tbody.find_all("tr", recursive=False):
+                cell = row.find_all("td", recursive=False)
+                rowc = []
 
-output = []
+                rowc.append(i) # append phrase id
 
-for table in list_table:
-    for row in table.find_all("tr")[1:]:
-        cell = row.find_all("td")
-        for content in cell:
-            text = content.get_text()
-            output.append(text)
+                # add cell content
+                for content in cell:
+                    text = (content.get_text(" ", strip=True)).rstrip()
+                    rowc.append(text)
+                if len(rowc) > 1:
+                    rowc.append(len(rowc[1]))
+                    writer.writerow(rowc)
+                    i = i + 1
-print(output)
+    f.close()
 
+if __name__ == "__main__":
+    main()
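
Note (not part of the commit): a minimal sketch of reading back the phrases.csv that the new extract.py writes, assuming the script above has already been run in the same directory. The column names come from the header row the script writes (id, Latin, English, Notes, Length); the reader itself is illustrative only.

#!/usr/bin/env python3
# Illustrative reader for the CSV produced by extract.py above.
# Assumes phrases.csv exists in the current directory.
import csv

with open('phrases.csv', newline='') as f:
    reader = csv.DictReader(f)
    for n, row in enumerate(reader):
        if n >= 3:
            break  # show only the first few phrases
        # 'Length' is the character count of the Latin phrase,
        # appended by extract.py as the last field of each row.
        print(row['id'], row['Latin'], '->', row['English'])

Rows scraped from wikitables with a different column count may not line up with this header exactly, so treat the keys as a best-effort mapping.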