# Post-change side of the extract.py diff hunk (new blob a76fbfb). The hunk
# begins at line 4 of extract.py, so any lines above the imports are not part
# of this excerpt.
import csv

import requests
from bs4 import BeautifulSoup

URL = "https://en.wikipedia.org/wiki/List_of_Latin_phrases_(full)"
OUTPUT_PATH = "phrases.csv"
HEADERS = ["id", "Latin", "English", "Notes", "Length"]
REQUEST_TIMEOUT = 30  # seconds; without it requests.get() may block forever


def _iter_phrase_rows(soup):
    """Yield the cell texts of each data row in every "wikitable" table.

    Only direct ``<tr>`` children of a table body and direct ``<td>`` children
    of a row are considered, so nested tables do not produce extra rows. Rows
    without any ``<td>`` (e.g. header rows made only of ``<th>``) are skipped.
    """
    for table in soup.find_all("table", attrs={"class": "wikitable"}):
        # Fall back to the table itself when the parsed markup carries no
        # <tbody>, instead of failing with AttributeError on None.
        body = table.tbody if table.tbody is not None else table
        for row in body.find_all("tr", recursive=False):
            cells = row.find_all("td", recursive=False)
            if cells:
                # strip=True already trims every text fragment, so no extra
                # rstrip() is needed on the joined result.
                yield [cell.get_text(" ", strip=True) for cell in cells]


def main():
    """Download the Latin phrase list and write it to ``phrases.csv``.

    Output columns are ``id, Latin, English, Notes, Length``: ``id`` is a
    running number starting at 0 that counts only the rows actually written,
    and ``Length`` is the character count of the first cell of the row.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.get(URL, timeout=REQUEST_TIMEOUT)
    # Fail loudly rather than parse an error page into a header-only CSV.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # newline="" lets the csv writer control line endings on every platform;
    # an explicit UTF-8 encoding keeps non-ASCII phrase text writable
    # regardless of the locale default.
    with open(OUTPUT_PATH, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(HEADERS)
        for phrase_id, texts in enumerate(_iter_phrase_rows(soup)):
            writer.writerow([phrase_id, *texts, len(texts[0])])


if __name__ == "__main__":
    main()