X-Git-Url: https://git.armaanb.net/?p=phrases.git;a=blobdiff_plain;f=extract.py;h=a76fbfbd991d599b789ef062097df750237bddf8;hp=322a3020c3eccb18c9ad5c29b8ff9845f81771b8;hb=3e4ee2d1a36d23079d68d3dda8b004bbd7cc8106;hpb=7903f060595381a5cf3c94d71722aba6be5b1629 diff --git a/extract.py b/extract.py index 322a302..a76fbfb 100755 --- a/extract.py +++ b/extract.py @@ -14,43 +14,32 @@ def main(): soup = BeautifulSoup(html, "html.parser") list_table = soup.find_all("table", attrs={"class":"wikitable"}) with open('phrases.csv', 'w') as f: - writer = csv.writer(f) + writer = csv.writer(f, lineterminator="\n") i = 0 # For the phrase id + + # write header + headers = ['id', 'Latin', 'English', 'Notes', 'Length'] + writer.writerow(headers) + # iterate through the tables in the page for table in list_table: - for row in table.find_all("tr")[1:]: - cell = row.find_all("td") + for row in table.tbody.find_all("tr", recursive=False): + cell = row.find_all("td", recursive=False) rowc = [] - # append phrase id - rowc.append(i) - - # avoid out of bounds errors - if len(cell) == 2: - lan = 2 - else: - lan = 3 + rowc.append(i) # append phrase id # add cell content - for j in range (0, lan): - content = cell[j] - text=(content.get_text()).rstrip() + for content in cell: + text = (content.get_text(" ", strip=True)).rstrip() rowc.append(text) - # append length of phrase - rowc.append(len(rowc[1])) - writer.writerow(rowc) + if len(rowc) > 1: + rowc.append(len(rowc[1])) + writer.writerow(rowc) i = i + 1 f.close() - # Strip empty lines - with open('phrases.csv', 'r+') as f: - lines = f.readlines() - f.seek(0) - f.writelines(line for line in lines if line.strip()) - f.truncate() - f.close() - if __name__ == "__main__": main()