From f8fd1cc20c4f66f7062188e9e37f44e282301486 Mon Sep 17 00:00:00 2001 From: Kim Taylor Date: Sat, 16 Nov 2024 12:03:29 +1100 Subject: [PATCH] Results parser working for all LGAs (except melbourne) --- results/fetch.sh | 14 +++- results/gen-elected.php | 158 ++++++++++++++++++++++++++++++++++++++++ results/parser.py | 59 +++++++++++---- update-elected.sh | 21 ++++++ 4 files changed, 235 insertions(+), 17 deletions(-) create mode 100644 results/gen-elected.php create mode 100755 update-elected.sh diff --git a/results/fetch.sh b/results/fetch.sh index 6184537..45a9fa8 100755 --- a/results/fetch.sh +++ b/results/fetch.sh @@ -1,5 +1,15 @@ #!/bin/bash -council_name=boroondara-city-council +mkdir -p html +wget https://www.vec.vic.gov.au/results/2024-council-election-results -O html/lga_list.html -wget https://www.vec.vic.gov.au/voting/2024-local-council-elections/$council_name/results -O $council_name +IFS=$'\n' + +lgas=$(grep 'href="/voting/.*/results"' html/lga_list.html) + +for lga in $lgas ; do + lga=$(sed 's|.*href="|https://www.vec.vic.gov.au|' <<< $lga) + lga=$(sed 's|">.*||' <<< $lga) + file=$(sed 's|.*elections/||' <<< $lga | sed s'|/results||') + wget $lga -O html/$file +done diff --git a/results/gen-elected.php b/results/gen-elected.php new file mode 100644 index 0000000..c6c742d --- /dev/null +++ b/results/gen-elected.php @@ -0,0 +1,158 @@ + $value) { + $candidate[$value] = $data[$key]; + } + $candidate['Council'] = $config['councilName']; + $name_slug = trim_sluggify($candidate['Candidate Name']); + $candidate_data[$name_slug] = $candidate; + } + } +} + +print_r($candidate_data); + +/* Get list of elected candidates */ + +//$lga_list = []; +/* Generate dictionary of LGAs and Wards */ +//foreach ($config_files as $config_file) { +// $config_string = file_get_contents($config_file); +// if ($config_string !== FALSE) { +// $config = json_decode($config_string, true); +// } else { +// error_log("Error opening config.json."); +// exit(1); +// } +// $config['config-file'] = $config_file; +// $lga_list[] = $config; +//} + +/* Match user typed LGA/Ward to our database */ +//match_lga($candidate_data, $lga_list); + +$header = ["Ward", "Candidate Name", "Rating", "Pledge", "Picture"]; + +/* Generate candidates-generic.csv */ +//foreach ($lga_list as $lga) { +// $lga_candidates = array_filter($candidate_data, function ($candidate) use ($lga) { +// return $candidate['match_lga'] === $lga['slug']; +// }); +// +// if (count($lga_candidates) === 0) continue; +// +// remove_duplicates($lga_candidates); +// +// $dir = dirname($lga['config-file']); +// $dir_files = scandir($dir); +// $output_file = $dir."/candidates-generic.csv"; +// $override_file = $dir."/candidates-override.csv"; +// +// if (($handle = fopen($output_file, "w")) === FALSE) { +// error_log('Error opening output file'); +// exit(1); +// } +// +// if (fputcsv($handle, $header) === FALSE) { +// error_log('Error writing headers to output file'); +// exit(3); +// } +// +// $lines = []; +// foreach ($lga_candidates as $candidate) { +// /* Add extension to photo hash */ +// if (strlen($candidate['Photo'])) { +// foreach ($dir_files as $file) { +// if (preg_match("/\.json$/", $file)) continue; +// if (strstr($file, $candidate['Photo'])) { +// $candidate['Photo'] = $file; +// } +// } +// } +// +// $lines[] = [ +// $candidate['match_ward'], +// $candidate['Name'], +// $candidate['Score'], +// $candidate['Pledge'], +// $candidate['Photo'], +// ]; +// } +// +// /* Apply overrides if they exist */ +// $overrides = []; +// if (file_exists($override_file)) { +// if (($ovr_handle = fopen($override_file, "r")) !== FALSE) { +// $headers = fgetcsv($ovr_handle); +// while (($data = fgetcsv($ovr_handle)) !== FALSE) { +// $override = []; +// foreach ($headers as $key => $value) { +// $override[$value] = $data[$key]; +// } +// $overrides[] = $override; +// } +// fclose($ovr_handle); +// } else { +// error_log('Error opening overrides file'); +// exit(3); +// } +// } +// +// foreach ($overrides as $override) { +// foreach ($lines as $line_key => $line) { +// $match_index = array_search($override['Match Field'], $header); +// $replace_index = array_search($override['Replace Field'], $header); +// if ($line[$match_index] === $override['Match Value']) { +// if ($replace_index !== false) +// $lines[$line_key][$replace_index] = $override['Replace Value']; +// else /* If 'Replace Field' is not matched - delete this entry */ +// $lines[$line_key]['Delete'] = 'y'; +// } +// } +// } +// +// foreach ($lines as $line) { +// if (isset($line['Delete'])) continue; +// if (fputcsv($handle, $line) === FALSE) { +// error_log('Error writing candidate to output file'); +// exit(3); +// } +// } +// fclose($handle); +//} + +exit(0); diff --git a/results/parser.py b/results/parser.py index 734e593..3875741 100644 --- a/results/parser.py +++ b/results/parser.py @@ -1,24 +1,53 @@ from bs4 import BeautifulSoup, Tag as HTMLTag +import json, re, argparse -with open("boroondara-city-council", 'r') as results_fp: - html_doc = results_fp.read() +parser = argparse.ArgumentParser() +parser.add_argument('filenames', nargs='*') +args = parser.parse_args() -soup = BeautifulSoup(html_doc, 'html.parser') -candidates0 = soup.find_all(string="Successful candidates") -candidates1 = soup.find_all(string="Elected candidates") +def get_vacancies(ward): + text = ward.parent.parent.h2.text + ward_name = re.search("[^\(]*", text)[0].strip() + vacancies = int(re.search("\([0-9]+", text)[0].strip("(")) + return (ward_name, vacancies, ward) -def get_candidate_name(candidate): - for sibling in candidate.parent.next_siblings: +def get_candidate_names(ward_desc): + names = [] + for sibling in ward_desc[2].parent.next_siblings: if not isinstance(sibling, HTMLTag): continue - if not (block := sibling.find('td', class_="list-item-body")): + if not (blocks := sibling.find_all('td', class_="list-item-body")): continue - return block.text.strip() + for block in blocks: + names.append(re.sub('\n.*', '', block.text.strip())) + return names -names = [] -for candidate in candidates0: - names.append(get_candidate_name(candidate)) -for candidate in candidates1: - names.append(get_candidate_name(candidate)) +def parse_lga(filename): + with open(filename, 'r') as results_fp: + html_doc = results_fp.read() -print(names) + soup = BeautifulSoup(html_doc, 'html.parser') + wards0 = soup.find_all(string="Successful candidates") + wards1 = soup.find_all(string="Elected candidates") + + ward_info = [] + for ward in wards0: + ward_info.append(get_vacancies(ward)) + for ward in wards1: + ward_info.append(get_vacancies(ward)) + + results = {} + for ward in ward_info: + names = get_candidate_names(ward) + assert len(names) == ward[1] + results[ward[0]] = names + + return results + +all_results = {} +for lga in args.filenames: + lga_name = re.sub('html/lgas/', '', lga) + results = parse_lga(lga) + all_results[lga_name] = results + +print(json.dumps(all_results, indent=4)) diff --git a/update-elected.sh b/update-elected.sh new file mode 100755 index 0000000..ac599fe --- /dev/null +++ b/update-elected.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# This script uses the jq, wp, and php commands, make sure they are installed before running this script. + +# The folder containing data for each council. +# Includes the list of candidates and any media. +DATA_PATH="../spl-data" + +# Iterate over folders in data path +candidates_files=() +for folder in "$DATA_PATH"/*; do + if test -f "$folder"/candidates-generic.csv; then + candidates_files+=("$folder"/candidates-generic.csv) + fi + # Community groups get priority + if test -f "$folder"/candidates.csv; then + candidates_files+=("$folder"/candidates.csv) + fi +done + +php results/gen-elected.php --candidates-files "${candidates_files[*]}"