54 lines
1.6 KiB
Python
54 lines
1.6 KiB
Python
from bs4 import BeautifulSoup, Tag as HTMLTag
|
|
import json, re, argparse
|
|
|
|
# Command-line interface: zero or more paths to saved HTML result pages.
# The description/help strings only affect the --help output; parsing is
# unchanged (an empty file list is still accepted).
parser = argparse.ArgumentParser(
    description=(
        "Extract the candidate names listed for each ward in saved HTML "
        "result pages and print them as JSON."
    ),
)
parser.add_argument(
    'filenames',
    nargs='*',
    help=(
        "HTML result files to parse; any 'html/lgas/' in a path is removed "
        "to form that file's key in the JSON output"
    ),
)
args = parser.parse_args()
|
|
|
|
def get_vacancies(ward):
    """Extract the ward name and vacancy count from a ward's heading.

    The heading text is read from ``ward.parent.parent.h2.text``. The ward
    name is everything before the first ``(`` with surrounding whitespace
    removed; the vacancy count is the first run of ASCII digits that
    directly follows a ``(``.

    Args:
        ward: Parsed node whose grandparent exposes the heading as ``h2``.

    Returns:
        A ``(ward_name, vacancies, ward)`` tuple; ``ward`` is passed
        through unchanged so later steps can navigate from it.

    Raises:
        ValueError: If the heading contains no ``(<digits>`` group.
    """
    text = ward.parent.parent.h2.text

    # Same result as matching "[^(]*" from the start of the text, but
    # without a regex (and without the invalid "\(" string escape).
    ward_name = text.partition("(")[0].strip()

    # Raw string: "\(" inside a plain literal is an invalid escape sequence.
    match = re.search(r"\(([0-9]+)", text)
    if match is None:
        raise ValueError(
            f"no vacancy count found in ward heading {text!r}"
        )

    return (ward_name, int(match[1]), ward)
|
|
|
|
def get_candidate_names(ward_desc):
    """Collect candidate names that follow a ward's section marker.

    Starting from the parent of ``ward_desc[2]``, every following sibling
    that is an HTML tag is searched for ``<td class="list-item-body">``
    cells. For each cell, the stripped text with everything from the first
    newline onwards removed is recorded.

    Args:
        ward_desc: Indexable whose item 2 is the parsed marker node (the
            tuples returned by ``get_vacancies`` have this shape).

    Returns:
        List of names in document order; empty if no matching cell exists.
    """
    anchor = ward_desc[2].parent

    # Non-tag siblings (e.g. bare text between elements) cannot be searched.
    tag_siblings = (
        node for node in anchor.next_siblings if isinstance(node, HTMLTag)
    )

    collected = []
    for tag in tag_siblings:
        # A tag without matching cells yields an empty result and adds nothing.
        for cell in tag.find_all('td', class_="list-item-body"):
            cell_text = cell.text.strip()
            # Keep only the first line of the cell.
            first_line = re.sub('\n.*', '', cell_text)
            collected.append(first_line)
    return collected
|
|
|
|
def parse_lga(filename):
    """Parse one saved HTML results page into ``{ward name: [names]}``.

    Wards are located through text nodes equal to "Successful candidates"
    or "Elected candidates". For each one, the ward name and vacancy count
    come from ``get_vacancies`` and the names from ``get_candidate_names``.

    Args:
        filename: Path of the HTML file to read.

    Returns:
        dict mapping each ward name to its list of candidate names. If two
        wards share a name, the later one replaces the earlier one.

    Raises:
        ValueError: If the number of names found for a ward differs from
            the vacancy count stated in its heading.
    """
    # Explicit encoding so the result does not depend on the platform's
    # default locale. NOTE(review): assumes the saved pages are UTF-8.
    with open(filename, 'r', encoding='utf-8') as results_fp:
        soup = BeautifulSoup(results_fp.read(), 'html.parser')

    # Either wording can mark a ward's section; all "Successful" matches
    # are handled before the "Elected" ones.
    ward_info = [
        get_vacancies(marker)
        for label in ("Successful candidates", "Elected candidates")
        for marker in soup.find_all(string=label)
    ]

    results = {}
    for ward in ward_info:
        ward_name, vacancies = ward[0], ward[1]
        names = get_candidate_names(ward)
        # Explicit check rather than `assert`, which is removed under -O.
        if len(names) != vacancies:
            raise ValueError(
                f"{filename}: ward {ward_name!r} has {vacancies} "
                f"vacancies but {len(names)} candidate names were found"
            )
        results[ward_name] = names

    return results
|
|
|
|
# Parse every file named on the command line. Each file's key is its path
# with 'html/lgas/' removed; the combined mapping is printed as JSON.
all_results = {
    re.sub('html/lgas/', '', lga): parse_lga(lga)
    for lga in args.filenames
}

print(json.dumps(all_results, indent=4))
|