"""
Scraper for DAEN (TGA) using SeleniumBase.
"""
import os
import shutil
import time
from collections.abc import Callable
from typing import Any
import pandas as pd
from selenium.webdriver.common.keys import Keys
from seleniumbase import SB
[docs]
def scrape_daen_sb(
medicine: str,
output_dir: str = "data/daen",
callback: Callable[[dict], None] | None = None,
headless: bool = True,
fallback_wait: int = 240,
num_retries: int = 5,
) -> pd.DataFrame: # pragma: no cover
"""
Scrapes the reported MedDRA Preferred Terms and counts for a given medicine
from the Australian DAEN database.
Parameters
-----------
medicine: str
Drug/medicine name to search.
output_dir: str
Directory to save the Excel (.xlsx) data file (default "data/daen").
callback: callable, optional
Callable to receive UI/status events, called with a dict.
This is essential to show progress to user.
headless: bool
Run the browser in headless mode (default True).
fallback_wait: int
Seconds to wait for the browser download to finish in its default
folder before attempting to move it to `output_dir`.
num_retries: int
Number of retries for data scraping after which error is thrown (default 5).
Returns
--------
A dataFrame of the downloaded data.
"""
def _emit(event_type: str, **kw: Any) -> None: # pragma: no cover
if callback:
try:
callback({"type": event_type, **kw})
except Exception: # pragma: no cover
raise # pragma: no cover
med = (medicine or "").strip()
if not med:
_emit("error", message="Medicine is required for DAEN scrape")
raise ValueError("medicine is required") # pragma: no cover
os.makedirs(output_dir, exist_ok=True)
exceptions = []
for attempt in range(num_retries):
try:
if attempt > 0:
_emit("log", message=f"Retrying... ({attempt + 1}/{num_retries})\n")
url = "https://daen.tga.gov.au/medicines-search/"
_emit(
"log",
message=f"Opening DAEN (TGA) medicines search (Attempt {attempt + 1})\n",
)
with SB(uc=True, headless=headless) as sb:
sb.open(url)
try:
sb.scroll_into_view("input#termsCondition")
sb.click_if_visible("input#termsCondition", timeout=5)
except Exception: # pragma: no cover
pass
outer_iframe = "div#reportContainer iframe"
sb.wait_for_element_present(outer_iframe, timeout=60)
with sb.frame_switch(outer_iframe):
inner_iframe = 'iframe[name="visual-sandbox"]'
sb.wait_for_element_present(inner_iframe, timeout=60)
with sb.frame_switch(inner_iframe):
search_box = 'input[placeholder*="Search"]'
sb.wait_for_element(search_box, timeout=60)
sb.clear(search_box)
sb.type(search_box, med)
sb.press_keys(search_box, Keys.ENTER)
try:
sb.wait_for_element(
'//*[@id="pvExplorationHost"]/div/div/exploration/div/explore-canvas/div/div[2]/div/div[2]/div[2]/visual-container-repeat/visual-container[34]/transform'
)
sb.sleep(10)
sb.scroll_into_view(
'//*[@id="pvExplorationHost"]/div/div/exploration/div/explore-canvas/div/div[2]/div/div[2]/div[2]/visual-container-repeat/visual-container[34]/transform'
)
row = '//*[@id="pvExplorationHost"]/div/div/exploration/div/explore-canvas/div/div[2]/div/div[2]/div[2]/visual-container-repeat/visual-container[34]/transform/div/div[3]/div/div/visual-modern/div/div/div[2]/div[1]/div[2]'
sb.click(row)
sb.sleep(1)
options_menu = '//*[@id="pvExplorationHost"]/div/div/exploration/div/explore-canvas/div/div[2]/div/div[2]/div[2]/visual-container-repeat/visual-container[34]/transform/div/visual-container-header/div/div/div/visual-container-options-menu'
sb.click(options_menu)
sb.sleep(1)
export_btn = '//*[@id="0"]'
sb.click(export_btn)
sb.sleep(1)
confirm_btn = '//*[@id="mat-mdc-dialog-0"]/div/div/export-data-dialog/mat-dialog-actions/button[1]'
sb.click(confirm_btn)
except Exception as e: # pragma: no cover # pragma: no cover
_emit("log", message=f"Export initiation failed: {e}")
raise # pragma: no cover
sb.sleep(5)
try:
stray_dir = os.path.join(os.getcwd(), "downloaded_files")
os.makedirs(output_dir, exist_ok=True)
end_time = time.time() + max(0, int(fallback_wait))
last_candidate = None
while time.time() < end_time:
if os.path.isdir(stray_dir):
entries = [
os.path.join(stray_dir, f)
for f in os.listdir(stray_dir)
if not f.endswith(".crdownload")
and not f.startswith(".")
]
if entries:
entries.sort(
key=lambda p: os.path.getmtime(p), reverse=True
)
last_candidate = entries[0]
break
time.sleep(1)
if not last_candidate or not os.path.isfile(last_candidate):
_emit(
"error",
message=(
"No completed download detected in 'downloaded_files' within "
f"{fallback_wait}s."
),
)
raise RuntimeError(
"DAEN export file not detected"
) # pragma: no cover
base_name = os.path.basename(last_candidate)
_root, ext = os.path.splitext(base_name)
if not ext:
ext = ".xlsx"
target_name = f"{med}_daen_export{ext}"
target_path = os.path.join(output_dir, target_name)
if os.path.isfile(target_path):
try:
os.remove(last_candidate)
except Exception:
raise # pragma: no cover
else:
shutil.move(last_candidate, target_path)
_emit(
"log",
message=f"Data saved to: {os.path.abspath(target_path)}",
)
_emit("download_complete", path=target_path, filename=target_name)
try:
# The DAEN export is expected to be an Excel .xlsx file.
df = pd.read_excel(target_path, engine="openpyxl")
return df
except Exception as e: # pragma: no cover # pragma: no cover
_emit("log", message=f"Failed to read exported file: {e}")
raise # pragma: no cover
except Exception: # pragma: no cover
raise # pragma: no cover
except Exception as e: # pragma: no cover
exceptions.append(e)
_emit("log", message=f"Attempt {attempt + 1} failed with error: {e}.\n")
time.sleep(20)
continue
_emit(
"error",
message=(
f"All {num_retries} attempt(s) to scrape data for {medicine} failed. "
"Please check the following:\n"
"1. Ensure you have a stable internet connection.\n"
"2. Verify that 'https://daen.tga.gov.au/medicines-search/' opens correctly in your Chrome browser.\n"
"3. If these steps do not resolve the issue, please wait a while and retry. \n"
"If problems persist, contact the developer at https://github.com/rmj3197/SurVigilance/issues "
"for assistance.\n\n"
),
)
raise RuntimeError(
f"All {num_retries} attempt(s) to scrape data for {medicine} failed. "
"Please check the following:\n"
"1. Ensure you have a stable internet connection.\n"
"2. Verify that 'https://daen.tga.gov.au/medicines-search/' opens correctly in your Chrome browser.\n"
"3. If these steps do not resolve the issue, please wait a while and retry. \n"
"If problems persist, contact the developer at https://github.com/rmj3197/SurVigilance/issues "
"for assistance.\n\n"
)