Results parser working for all LGAs (except Melbourne)
This commit is contained in:
@@ -1,24 +1,53 @@
|
||||
from bs4 import BeautifulSoup, Tag as HTMLTag
|
||||
import json, re, argparse
|
||||
|
||||
with open("boroondara-city-council", 'r') as results_fp:
|
||||
html_doc = results_fp.read()
|
||||
# Command-line interface: zero or more paths to saved results pages.
# With no paths given, `args.filenames` is an empty list.
parser = argparse.ArgumentParser(
    description=(
        "Extract the winning candidates for each ward from saved LGA "
        "results pages and print them as JSON."
    ),
)
parser.add_argument(
    'filenames',
    nargs='*',
    help="saved LGA results HTML file(s) to parse",
)
args = parser.parse_args()
|
||||
|
||||
soup = BeautifulSoup(html_doc, 'html.parser')
|
||||
candidates0 = soup.find_all(string="Successful candidates")
|
||||
candidates1 = soup.find_all(string="Elected candidates")
|
||||
def get_vacancies(ward):
    """Read the ward name and vacancy count from the heading above *ward*.

    The heading is the ``h2`` found on the grandparent of *ward*
    (``ward.parent.parent.h2``). Its text is expected to look like
    ``"<ward name> (<N> ..."``: everything before the first ``(`` is the
    ward name, and the digits directly after that ``(`` are the number of
    vacancies.

    Args:
        ward: A node whose ``.parent.parent`` exposes the ``h2`` heading.

    Returns:
        A ``(ward_name, vacancies, ward)`` tuple; *ward* is passed through
        unchanged so later steps can keep navigating from it.

    Raises:
        ValueError: If the heading contains no ``(<digits>`` part.
    """
    text = ward.parent.parent.h2.text

    # Everything up to (not including) the first "(" is the ward name.
    ward_name = text.partition("(")[0].strip()

    # Raw string: "\(" in a normal string literal is an invalid escape.
    match = re.search(r"\(([0-9]+)", text)
    if match is None:
        raise ValueError(f"no vacancy count found in ward heading {text!r}")

    return (ward_name, int(match.group(1)), ward)
|
||||
|
||||
def get_candidate_name(candidate):
|
||||
for sibling in candidate.parent.next_siblings:
|
||||
def get_candidate_names(ward_desc):
    """Return the candidate names listed after a ward's results heading.

    Args:
        ward_desc: A sequence whose third item (``ward_desc[2]``) is the
            node to start from; the siblings following that node's parent
            are scanned.

    Returns:
        A list with the first line of text of every
        ``td.list-item-body`` cell in the first following sibling tag that
        contains any such cell, or an empty list when no sibling does.
    """
    # NOTE(review): only the first sibling that holds candidate cells is
    # used - confirm this matches the page layout if cells can be spread
    # over several siblings.
    for sibling in ward_desc[2].parent.next_siblings:
        # Skip non-tag siblings (e.g. whitespace text between elements).
        if not isinstance(sibling, HTMLTag):
            continue
        cells = sibling.find_all('td', class_="list-item-body")
        if cells:
            # Keep only the first line of each cell's stripped text.
            return [cell.text.strip().partition('\n')[0] for cell in cells]

    # Nothing matched: return an empty list so callers can still take len().
    return []
|
||||
|
||||
names = []
|
||||
for candidate in candidates0:
|
||||
names.append(get_candidate_name(candidate))
|
||||
for candidate in candidates1:
|
||||
names.append(get_candidate_name(candidate))
|
||||
def parse_lga(filename):
    """Parse one saved results page into a ward -> winners mapping.

    Every "Successful candidates" / "Elected candidates" text node in the
    page is treated as the start of one ward's results. For each of them
    the ward heading is read with ``get_vacancies`` and the names with
    ``get_candidate_names``.

    Args:
        filename: Path of the HTML file to read.

    Returns:
        A dict mapping ward name to the list of candidate names found for
        that ward.

    Raises:
        ValueError: If the number of names found for a ward differs from
            the vacancy count in its heading.
    """
    with open(filename, 'r') as results_fp:
        html_doc = results_fp.read()

    soup = BeautifulSoup(html_doc, 'html.parser')

    # Pages use either label; collect wards for both, "Successful" first.
    ward_info = []
    for label in ("Successful candidates", "Elected candidates"):
        for ward in soup.find_all(string=label):
            ward_info.append(get_vacancies(ward))

    results = {}
    for ward in ward_info:
        ward_name, vacancies = ward[0], ward[1]
        names = get_candidate_names(ward)
        # Explicit check instead of `assert`, which is stripped under -O.
        if len(names) != vacancies:
            raise ValueError(
                f"{filename}: {ward_name!r} has {vacancies} vacancies "
                f"but {len(names)} candidate names were found"
            )
        results[ward_name] = names

    return results
|
||||
|
||||
# Parse every file named on the command line, keyed by its path with the
# 'html/lgas/' directory part removed, then dump the lot as indented JSON.
all_results = {
    re.sub('html/lgas/', '', lga): parse_lga(lga)
    for lga in args.filenames
}

print(json.dumps(all_results, indent=4))
|
||||
|
||||
Reference in New Issue
Block a user