X-Git-Url: https://git.armaanb.net/?a=blobdiff_plain;f=extract.py;h=322a3020c3eccb18c9ad5c29b8ff9845f81771b8;hb=7903f060595381a5cf3c94d71722aba6be5b1629;hp=fa745300e64bdccdb8b1473f5bd4e218b8f8f57c;hpb=4436720b4b07ef11b97af993e62788992c2ec515;p=phrases.git

diff --git a/extract.py b/extract.py
index fa74530..322a302 100755
--- a/extract.py
+++ b/extract.py
@@ -1,21 +1,56 @@
 #!/usr/bin/env python3
-# Tool to extract famous phrases from wikipedia
+# Extract Latin famous phrases from wikipedia
+# Armaan Bhojwani 2020
+
 from bs4 import BeautifulSoup
 import requests
+import csv
+
+def main():
+    url = 'https://en.wikipedia.org/wiki/List_of_Latin_phrases_(full)'
+    response = requests.get(url)
+    html = response.content
+
+    soup = BeautifulSoup(html, "html.parser")
+    list_table = soup.find_all("table", attrs={"class":"wikitable"})
+    with open('phrases.csv', 'w') as f:
+        writer = csv.writer(f)
+
+        i = 0 # For the phrase id
+        # iterate through the tables in the page
+        for table in list_table:
+            for row in table.find_all("tr")[1:]:
+                cell = row.find_all("td")
+                rowc = []
+
+                # append phrase id
+                rowc.append(i)
+
+                # avoid out of bounds errors
+                if len(cell) == 2:
+                    lan = 2
+                else:
+                    lan = 3
+
+                # add cell content
+                for j in range (0, lan):
+                    content = cell[j]
+                    text=(content.get_text()).rstrip()
+                    rowc.append(text)
+
+                # append length of phrase
+                rowc.append(len(rowc[1]))
+                writer.writerow(rowc)
+                i = i + 1
+    f.close()
+
+    # Strip empty lines
+    with open('phrases.csv', 'r+') as f:
+        lines = f.readlines()
+        f.seek(0)
+        f.writelines(line for line in lines if line.strip())
+        f.truncate()
+    f.close()
 
-url = 'https://en.wikipedia.org/wiki/List_of_Latin_phrases_(full)'
-response = requests.get(url)
-html = response.content
-
-soup = BeautifulSoup(html, "html.parser")
-list_table = soup.find_all("table", attrs={"class":"wikitable"})
-f = open("phrases", "w")
-
-for table in list_table:
-    for row in table.find_all("tr")[1:]:
-        f.write("%" )
-        cell = row.find_all("td")
-        for content in cell:
-            text = content.get_text()
-            f.write("\n" + text)
-f.close()
+if __name__ == "__main__":
+    main()