#!/usr/bin/env python
"""Scrape match results from Flashscore tournament pages into a CSV file.

For every tournament URL in ``urls_list`` the script opens the ``results/``
page in Firefox, rejects the cookie banner, keeps clicking "Show more matches"
until the link disappears, parses the rendered HTML with lxml and appends one
row per match to ``./tournament_results.csv``.
"""
import csv
import re
from time import sleep

from lxml import etree, html
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import visibility_of_element_located

# This is for test, load the urls list from https://dpaste.org/nZpuq
urls_list = [
    "https://www.flashscore.com/football/zimbabwe/premier-soccer-league-2021-2022/",
]

# As this code was originally created on Linux, this is the location of the
# Gecko driver there.
GECKODRIVER_PATH = "/usr/local/bin/geckodriver"

OUTPUT_PATH = './tournament_results.csv'
FIELDNAMES = ['tournament', 'round', 'date', 'time', 'home_team', 'away_team', 'home_score', 'away_score']

REJECT_COOKIES_XPATH = '//button[contains(@id, "onetrust-reject-all-handler")]'
# Parent element of the "Show more matches" caption, i.e. the clickable link.
SHOW_MORE_XPATH = "//span[text()='Show more matches']/.."
# Div container of all results.
RESULTS_XPATH = (
    '//div[@id = "live-table"]'
    '//div[@class = "event event--results"]'
    '//div[@class = "leagues--static event--leagues results"]'
    '//div[@class = "sportName soccer"]'
)
LEAGUE_HEADER_CLASS = (
    "wcl-header_uBhYi wclLeagueHeader wclLeagueHeader--collapsed "
    "wclLeagueHeader--noCheckBox wclLeagueHeader--indent"
)
ROUND_CLASS = "event__round event__round--static"

# Season span embedded in the tournament URL, e.g. "2021-2022".
SEASON_RE = re.compile(r"\d{4}\-\d{4}")


def _reject_cookies(driver):
    """Wait up to 10 seconds for the "reject all" cookie button and click it.

    The element returned by the wait is clicked directly, so no second lookup
    is needed. If the button never becomes visible the wait's timeout error
    propagates to the caller.
    """
    button = WebDriverWait(driver, 10).until(
        visibility_of_element_located((By.XPATH, REJECT_COOKIES_XPATH))
    )
    button.click()


def _load_all_matches(driver):
    """Click "Show more matches" until the link is no longer on the page.

    The results page loads its content dynamically. The link is looked up in
    the live DOM on every pass instead of through an absolute path computed
    once from a parsed snapshot, which could go stale as rows are added.
    """
    if not driver.find_elements(By.XPATH, SHOW_MORE_XPATH):
        return
    try:
        while True:
            # Scroll first so the link is in view, then give the page time
            # to settle before clicking.
            driver.find_element(By.XPATH, SHOW_MORE_XPATH).send_keys(Keys.PAGE_DOWN)
            sleep(5)
            driver.find_element(By.XPATH, SHOW_MORE_XPATH).click()
    except NoSuchElementException:
        # The link is gone: every match has been loaded.
        print("finish")


def _fetch_results_page(base_url):
    """Return the fully expanded results page at *base_url* as an lxml tree.

    A fresh Firefox instance is started for the page and is always shut down
    again, even when loading or expanding the page raises.
    """
    driver_service = webdriver.FirefoxService(executable_path=GECKODRIVER_PATH)
    driver = webdriver.Firefox(service=driver_service)
    try:
        driver.get(base_url)
        _reject_cookies(driver)
        _load_all_matches(driver)
        return html.fromstring(driver.page_source)
    finally:
        driver.quit()


def _parse_matches(html_content, base_url):
    """Extract one dict per match from a parsed results page.

    Header rows update the current tournament and round, which are attached
    to every following match row. When *base_url* contains a season span such
    as ``2021-2022`` a year is appended to each date: rows start in the
    season's last year and switch to its first year as soon as the month
    number increases from one row to the next (this assumes the page lists
    matches newest first -- TODO confirm). Without a season span the date is
    kept as ``DD.MM``.

    Raises IndexError if the results container or an expected cell is absent.
    """
    results = html_content.xpath(RESULTS_XPATH)[0]

    # The season depends only on the URL, so resolve it once per page.
    season = SEASON_RE.search(base_url)
    season_years = season.group().split("-") if season else None

    standings = []
    round_name = None
    tournament = None
    last_month = None
    year = None
    for row in results:
        # Children without a class attribute are neither headers nor matches.
        row_class = row.attrib.get("class", "")
        if row_class == LEAGUE_HEADER_CLASS:
            tournament = (
                row.text_content()
                .replace("Standings", "")
                .replace("Draw", "")
                .replace(chr(160), "")
            )
        elif row_class == ROUND_CLASS:
            round_name = row.text_content()
        elif row_class.startswith("event__match"):
            # The time cell reads like "27.11. 15:00": date token, then time.
            when = row.xpath(".//div[@class = 'event__time']")[0].text_content().split()
            date = when[0][:-1]  # drop the trailing dot of "DD.MM."
            match = {
                "date": date,
                "time": when[1],
                "home_team": row.xpath('.//div[contains(@class, "homeParticipant")]')[0].text_content(),
                "away_team": row.xpath('.//div[contains(@class, "awayParticipant")]')[0].text_content(),
                "home_score": row.xpath('.//span[contains(@class, "event__score--home")]')[0].text_content(),
                "away_score": row.xpath('.//span[contains(@class, "event__score--away")]')[0].text_content(),
            }
            if season_years:
                month = int(date.split(".")[-1])
                if last_month is None:
                    last_month = month
                if year is None:
                    year = season_years[-1]
                if month > last_month:
                    # The month number went up between consecutive rows, so
                    # the rows have crossed into the season's first year.
                    print("change year")
                    year = season_years[0]
                match["date"] = f"{date}.{year}"
                last_month = month
            standings.append({"tournament": tournament, "round": round_name} | match)
    return standings


def _save_results(standings):
    """Append *standings* to the CSV file, writing the header only once.

    The file is opened in append mode so results of several tournaments (and
    several runs) accumulate; the header row is emitted only when the file is
    still empty, otherwise it would be repeated in the middle of the data.
    """
    with open(OUTPUT_PATH, 'a', newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
        # In append mode the stream starts at the end of the file, so a
        # position of 0 means the file is new or empty.
        if csvfile.tell() == 0:
            writer.writeheader()
        writer.writerows(standings)


def main():
    """Scrape every tournament in ``urls_list`` and append it to the CSV."""
    for item in urls_list:
        base_url = item + "results/"
        print(f"Working on {item}")
        html_content = _fetch_results_page(base_url)
        standings = _parse_matches(html_content, base_url)
        # Save results to file
        _save_results(standings)
        # Pause between tournaments before starting the next browser session.
        sleep(5)


if __name__ == "__main__":
    main()