from bs4 import BeautifulSoup, Tag as HTMLTag
import json, re, argparse
# Command-line interface: zero or more positional paths, collected into the
# list ``args.filenames`` (empty when no paths are given, because nargs='*').
parser = argparse.ArgumentParser()
parser.add_argument('filenames', nargs='*')
# NOTE: this is module-level code, so sys.argv is parsed as soon as the module
# is executed or imported.
args = parser.parse_args()
def get_vacancies(ward):
    """Extract the ward name and vacancy count from the heading above *ward*.

    The heading text is read from ``ward.parent.parent.h2.text``. Everything
    before the first ``(`` (stripped of surrounding whitespace) is taken as
    the ward name, and the digits immediately following the first ``(`` that
    is followed by a digit are taken as the number of vacancies.

    Args:
        ward: A parsed-HTML node whose grandparent has an ``h2`` heading.

    Returns:
        A ``(ward_name, vacancies, ward)`` tuple. ``ward`` is passed through
        unchanged so later steps can keep navigating from it.

    Raises:
        ValueError: If the heading contains no ``(<digits>`` group.
    """
    text = ward.parent.parent.h2.text
    # Name is whatever precedes the first "(" (the whole heading if none).
    ward_name = text.partition("(")[0].strip()
    # Raw string: "\(" in a plain literal is an invalid escape sequence.
    count_match = re.search(r"\(([0-9]+)", text)
    if count_match is None:
        raise ValueError(f"no vacancy count found in heading {text!r}")
    return (ward_name, int(count_match[1]), ward)
def get_candidate_names(ward_desc):
    """Collect candidate names listed after a ward's marker node.

    Args:
        ward_desc: A ``(ward_name, vacancies, ward)`` tuple; only the third
            item (the marker node) is used.

    Returns:
        A list with one string per ``<td class="list-item-body">`` cell found
        in the tag siblings that follow the marker's parent. Each string is
        the first line of the cell's stripped text.
    """
    marker = ward_desc[2]
    collected = []
    for node in marker.parent.next_siblings:
        # Only tags can contain cells; other sibling nodes are ignored.
        if isinstance(node, HTMLTag):
            cells = node.find_all('td', class_="list-item-body")
            # Drop everything from the first newline on, keeping line one.
            collected.extend(
                re.sub('\n.*', '', cell.text.strip()) for cell in cells
            )
    return collected
def parse_lga(filename):
    """Parse one saved results page into ``{ward name: [candidate names]}``.

    The page is searched for text nodes equal to "Successful candidates" and
    then "Elected candidates". Each hit marks one ward: its heading supplies
    the ward name and vacancy count (via ``get_vacancies``) and the nodes
    that follow supply the candidate names (via ``get_candidate_names``).

    Args:
        filename: Path of the HTML file to read.

    Returns:
        A dict mapping each ward name to its list of candidate names. If two
        wards share a name, the later one overwrites the earlier one.

    Raises:
        ValueError: If the number of names found for a ward differs from the
            vacancy count in its heading.
    """
    with open(filename, 'r') as results_fp:
        html_doc = results_fp.read()
    soup = BeautifulSoup(html_doc, 'html.parser')

    # Keep the original ordering: all "Successful" markers, then "Elected".
    markers = [
        *soup.find_all(string="Successful candidates"),
        *soup.find_all(string="Elected candidates"),
    ]
    ward_info = [get_vacancies(marker) for marker in markers]

    results = {}
    for ward_desc in ward_info:
        ward_name, vacancies, _ = ward_desc
        names = get_candidate_names(ward_desc)
        # Explicit raise instead of ``assert`` so the consistency check is
        # not stripped when Python runs with -O, and so it says what failed.
        if len(names) != vacancies:
            raise ValueError(
                f"{filename}: ward {ward_name!r} has {vacancies} vacancies "
                f"but {len(names)} candidate names were found"
            )
        results[ward_name] = names
    return results
# Parse every file named on the command line and print a single JSON document
# mapping each file name (with any 'html/lgas/' occurrence removed) to the
# per-ward results returned by parse_lga.
all_results = {
    re.sub('html/lgas/', '', lga): parse_lga(lga)
    for lga in args.filenames
}
print(json.dumps(all_results, indent=4))