Snippet content copied to clipboard.
Are you sure you want to delete this snippet? No, don't delete
#!/usr/bin/env python
import csv
import re
from time import sleep

from lxml import etree, html
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.expected_conditions import visibility_of_element_located
from selenium.webdriver.support.ui import WebDriverWait
  12. # This is for test, load the urls list from https://dpaste.org/nZpuq
  13. urls_list = [
  14. "https://www.flashscore.com/football/zimbabwe/premier-soccer-league-2021-2022/",
  15. ]
  16. for item in urls_list:
  17. base_url = item + "results/"
  18. print(f"Working on {item}")
  19. # As this code was created originaly in Linux, this is the source of Geecko driver.
  20. path = "/usr/local/bin/geckodriver"
  21. driver_service = webdriver.FirefoxService(executable_path=path)
  22. driver = webdriver.Firefox(service=driver_service)
  23. driver.get(base_url)
  24. # Reject cookies
  25. WebDriverWait(driver, 10).until(visibility_of_element_located((By.XPATH, '//button[contains(@id, "onetrust-reject-all-handler")]'))).location
  26. driver.find_element(By.XPATH, '//button[contains(@id, "onetrust-reject-all-handler")]').click()
  27. # Look for all content of results page, as it loads the content dynamically
  28. while True:
  29. html_content = html.fromstring(driver.page_source)
  30. tree = etree.ElementTree(html_content)
  31. show_more_item = html_content.xpath(".//span[text()='Show more matches']")
  32. if not show_more_item == []:
  33. try:
  34. while True:
  35. show_more_path = tree.getpath(html_content.xpath(".//span[text()='Show more matches']")[0].getparent())
  36. driver.find_element(By.XPATH, show_more_path).send_keys(Keys.PAGE_DOWN)
  37. sleep(5)
  38. driver.find_element(By.XPATH, show_more_path).click()
  39. except NoSuchElementException:
  40. print("finish")
  41. break
  42. else:
  43. break
  44. html_content = html.fromstring(driver.page_source)
  45. driver.close()
  46. # Div container of all results
  47. results = html_content.xpath('//div[@id = "live-table"]//div[@class = "event event--results"]//div[@class = "leagues--static event--leagues results"]//div[@class = "sportName soccer"]')[0]
  48. standings = []
  49. round = None
  50. tournament = None
  51. last_month = None
  52. year = None
  53. for row in results.getchildren():
  54. if row.attrib.get("class") == "wcl-header_uBhYi wclLeagueHeader wclLeagueHeader--collapsed wclLeagueHeader--noCheckBox wclLeagueHeader--indent":
  55. tournament = row.text_content().replace("Standings", "").replace("Draw", "").replace(chr(160), "")
  56. if row.attrib.get("class") == "event__round event__round--static":
  57. round = row.text_content()
  58. if row.attrib.get("class").startswith("event__match"):
  59. date = row.xpath(".//div[@class = 'event__time']")[0].text_content().split()[0][:-1]
  60. years = re.search(r"\d{4}\-\d{4}", base_url)
  61. match = {
  62. "date": date,
  63. "time": row.xpath(".//div[@class = 'event__time']")[0].text_content().split()[1],
  64. "home_team": row.xpath('.//div[contains(@class, "homeParticipant")]')[0].text_content(),
  65. "away_team": row.xpath('.//div[contains(@class, "awayParticipant")]')[0].text_content(),
  66. "home_score": row.xpath('.//span[contains(@class, "event__score--home")]')[0].text_content(),
  67. "away_score": row.xpath('.//span[contains(@class, "event__score--away")]')[0].text_content()
  68. }
  69. if years:
  70. _years = years.group().split("-")
  71. if last_month == None: last_month = int(date.split(".")[-1])
  72. if year == None: year = _years[-1]
  73. if int(date.split(".")[-1]) <= last_month:
  74. match["date"] = f"{date}.{year}"
  75. last_month = int(date.split(".")[-1])
  76. else:
  77. print("change year")
  78. year = _years[0]
  79. match["date"] = f"{date}.{year}"
  80. last_month = int(date.split(".")[-1])
  81. standings.append({"tournament": tournament, "round": round} | match)
  82. # Save results to file
  83. fieldnames = ['tournament', 'round', 'date', 'time', 'home_team', 'away_team', 'home_score', 'away_score']
  84. with open('./tournament_results.csv', 'a', newline='') as csvfile:
  85. writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  86. writer.writeheader()
  87. writer.writerows(standings)
  88. sleep(5)

Edit this Snippet