"""Extract the winning candidates for each ward from saved result pages.

Every positional command-line argument is the path of an HTML file to parse.
A single JSON object is printed to stdout, mapping each path (with every
``html/lgas/`` fragment removed) to a ``{ward name: [candidate names]}``
mapping for that file.
"""
import argparse
import json
import re

from bs4 import BeautifulSoup, Tag as HTMLTag

parser = argparse.ArgumentParser()
parser.add_argument('filenames', nargs='*')
args = parser.parse_args()

# Exact text of the nodes that introduce a ward's list of winners.  Every
# match for the first label is processed before any match for the second,
# which fixes the order of the wards in the output.
_WINNER_HEADINGS = ("Successful candidates", "Elected candidates")

# Raw strings: "\(" in a normal string literal is an invalid escape sequence.
_WARD_NAME_RE = re.compile(r"[^\(]*")      # everything before the first "("
_VACANCIES_RE = re.compile(r"\([0-9]+")    # "(" followed by the digits


def get_vacancies(ward):
    """Return ``(ward_name, vacancies, ward)`` for one winners-heading node.

    The heading text is taken from the first ``<h2>`` found inside the
    grandparent of *ward*.  The ward name is the stripped text preceding the
    first ``(``; the vacancy count is the integer directly after a ``(``.

    Args:
        ward: A node returned by ``soup.find_all(string=...)``.

    Returns:
        A 3-tuple of the ward name (``str``), the number of vacancies
        (``int``) and the *ward* node itself, unchanged.

    Raises:
        ValueError: If the heading text contains no ``(<digits>`` group.
    """
    text = ward.parent.parent.h2.text
    # This pattern can match the empty string, so search() never returns None.
    ward_name = _WARD_NAME_RE.search(text)[0].strip()
    count = _VACANCIES_RE.search(text)
    if count is None:
        raise ValueError(f"no vacancy count found in ward heading {text!r}")
    vacancies = int(count[0].strip("("))
    return (ward_name, vacancies, ward)


def get_candidate_names(ward_desc):
    """Collect the candidate names listed after a winners heading.

    Walks the siblings that follow the parent of the heading node and, for
    every ``<td class="list-item-body">`` found inside them, keeps the first
    line of the cell's stripped text.

    Args:
        ward_desc: A tuple as returned by :func:`get_vacancies`; only its
            third item (the heading node) is used.

    Returns:
        A list of names in the order they are encountered.
    """
    names = []
    for sibling in ward_desc[2].parent.next_siblings:
        # Bare text nodes between tags cannot contain table cells.
        if not isinstance(sibling, HTMLTag):
            continue
        cells = sibling.find_all('td', class_="list-item-body")
        # Only the first line of each cell is kept as the name.
        names.extend(cell.text.strip().partition("\n")[0] for cell in cells)
    return names


def parse_lga(filename):
    """Parse one saved results page into ``{ward name: [candidate names]}``.

    Args:
        filename: Path of the HTML file to read.

    Returns:
        A dict mapping each ward name to its list of winning candidates.  If
        two wards share a name, the later one replaces the earlier one.

    Raises:
        AssertionError: If the number of names found for a ward differs from
            the vacancy count in its heading.
        ValueError: If a ward heading carries no vacancy count.
    """
    # NOTE(review): the file is decoded with the platform default encoding;
    # confirm the saved pages' encoding before pinning one here.
    with open(filename, 'r') as results_fp:
        soup = BeautifulSoup(results_fp.read(), 'html.parser')

    ward_info = []
    for label in _WINNER_HEADINGS:
        ward_info.extend(
            get_vacancies(heading) for heading in soup.find_all(string=label)
        )

    results = {}
    for ward in ward_info:
        ward_name, vacancies, _ = ward
        names = get_candidate_names(ward)
        # Explicit raise instead of ``assert`` so the consistency check still
        # runs under ``python -O``; the exception type is kept the same.
        if len(names) != vacancies:
            raise AssertionError(
                f"{filename}: ward {ward_name!r} has {vacancies} vacancies "
                f"but {len(names)} candidate names were found"
            )
        results[ward_name] = names
    return results


all_results = {}
for lga in args.filenames:
    # Drop the directory fragment so the output is keyed by the bare name.
    lga_name = re.sub('html/lgas/', '', lga)
    all_results[lga_name] = parse_lga(lga)
print(json.dumps(all_results, indent=4))