Parse XML file using selenium and bs4?

I am trying to parse an XML file using the following code:

import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Script under discussion: open the ProPublica organization page in Chrome,
# click the first "XML" link, switch to the window it opens, and try to read
# the <PhoneNum> element from that window's page source.

# Chrome launch options.
options = Options()
# options.add_argument('--headless=new')  
options.add_argument("start-maximized")
options.add_argument('--log-level=3')  
options.add_experimental_option("prefs", {"profile.default_content_setting_values.notifications": 1})    
# NOTE(review): the next two calls use the same "excludeSwitches" key; the
# second call presumably replaces the first value rather than extending it —
# confirm against Selenium's Options implementation.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled') 
srv=Service()
driver = webdriver.Chrome (service=srv, options=options)    
# driver.minimize_window()
# Explicit wait helper with a 10-second timeout.
waitWD = WebDriverWait (driver, 10)  

wLink = "https://projects.propublica.org/nonprofits/organizations/830370609"
driver.get(wLink) 
# Wait until the first link whose text is exactly "XML" is clickable, then
# click it through JavaScript instead of a native WebDriver click.
driver.execute_script("arguments[0].click();", waitWD.until(EC.element_to_be_clickable((By.XPATH, '(//a[text()="XML"])[1]'))))  
# Index 1 assumes the click has already opened a second window/tab.
driver.switch_to.window(driver.window_handles[1])    
# Fixed 3-second pause; presumably gives the new window time to load.
time.sleep(3) 
print(driver.current_url)
# NOTE(review): 'lxml' selects BeautifulSoup's HTML parser (the XML parser is
# 'xml'). HTML parsers lowercase tag names, which likely explains why the
# mixed-case lookup below prints None — confirm against the bs4 docs.
soup = BeautifulSoup (driver.page_source, 'lxml')   
worker = soup.find("PhoneNum")
print(worker)

But as you can see in the result, I am, for example, not able to parse the element "PhoneNum".

(selenium) C:\DEV\Fiverr2025\TRY\austibn>python test.py
https://pp-990-xml.s3.us-east-1.amazonaws.com/202403189349311780_public.xml?response-content-disposition=inline&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIA266MJEJYTM5WAG5Y%2F20250423%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250423T152903Z&X-Amz-Expires=1800&X-Amz-SignedHeaders=host&X-Amz-Signature=9743a63b41a906fac65c397a2bba7208938ca5b865f1e5a33c4f711769c815a4
None

How can I parse the XML file from this site?

Answer

Fixes:

Use requests.get() to fetch the XML directly (faster and more reliable than Selenium for raw XML).
Parse with BeautifulSoup(..., 'xml') (not 'lxml', which is the HTML parser and lowercases tag names, so find("PhoneNum") never matches).
Close Selenium after getting the URL (since it's no longer needed).
Check if the tag exists before accessing .text.

soup.find("PhoneNum") returns only the first phone number, so I use find_all() instead to return all matching elements.

The full code with corrections:

import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Corrected script: use Selenium only to discover the XML download URL, then
# fetch the document with requests and parse it with BeautifulSoup's XML parser.

# Chrome launch options.
options = Options()
options.add_argument("start-maximized")
options.add_argument('--log-level=3')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_argument('--disable-blink-features=AutomationControlled')

srv = Service()
driver = webdriver.Chrome(service=srv, options=options)
waitWD = WebDriverWait(driver, 10)

url = "https://projects.propublica.org/nonprofits/organizations/830370609"

# try/finally guarantees the browser is closed even if a wait times out or
# the page layout changes; otherwise a Chrome process would be left running.
try:
    driver.get(url)

    # Click the first link whose text is exactly "XML" via JavaScript.
    xml_button = waitWD.until(EC.element_to_be_clickable((By.XPATH, '(//a[text()="XML"])[1]')))
    driver.execute_script("arguments[0].click();", xml_button)

    # The click opens a second window/tab. Wait for it to exist before
    # indexing window_handles, which would otherwise raise IndexError if the
    # new window has not been registered yet.
    waitWD.until(EC.number_of_windows_to_be(2))
    driver.switch_to.window(driver.window_handles[1])
    # Fixed pause kept from the original so the new window can settle on its
    # final address before current_url is read.
    time.sleep(3)
    xml_url = driver.current_url
finally:
    driver.quit()

# Download the raw XML. A timeout prevents the script from hanging forever on
# a stalled connection, and network errors end the run with a clear message.
try:
    response = requests.get(xml_url, timeout=30)
except requests.RequestException as exc:
    raise SystemExit(f"Failed to download XML: {exc}") from exc

# Anything other than 200 is treated as a failure. SystemExit with a message
# exits with a non-zero status, unlike the interactive exit() helper.
if response.status_code != 200:
    raise SystemExit("Failed to download XML")

# The 'xml' parser keeps tag names case-sensitive, so "PhoneNum" matches.
soup = BeautifulSoup(response.content, 'xml')
phone_numbers = soup.find_all('PhoneNum')

if phone_numbers:
    print(f"Found {len(phone_numbers)} phone numbers:")
    for idx, phone in enumerate(phone_numbers, start=1):
        print(f"{idx}. {phone.text.strip()}")
else:
    print("No <PhoneNum> tags found in the XML.")

# Keep a local copy of the downloaded bytes exactly as received.
with open("propublica_data.xml", "wb") as f:
    f.write(response.content)
print("XML saved to 'propublica_data.xml'")

Output:

Found 4 phone numbers:
1. 6023146022
2. 6022687502
3. 6028812483
4. 6023146022
XML saved to 'propublica_data.xml'

Parse XML file using selenium and bs4?

Answer

Related Articles

How do I 'reset' an ft.Dropdown to its startup state after an option has been chosen?

Does the way you define nested lists in python matter? [duplicate]

Cannot communicate to SMTP server with socket and proxy due to timeout error

what is the oldest leap year in pandas?

How to reproduce `kneighbors_graph(include_self=True)` using `KNeighborsTransformer` in sklearn?

Moving Between Rooms v5