Source code for SurVigilance.ui.scrapers.scrape_daen

"""
Scraper for DAEN (TGA) using SeleniumBase.
"""

import os
import shutil
import time
from collections.abc import Callable
from typing import Any

import pandas as pd
from selenium.webdriver.common.keys import Keys
from seleniumbase import SB



[docs]
def scrape_daen_sb(
    medicine: str,
    output_dir: str = "data/daen",
    callback: Callable[[dict], None] | None = None,
    headless: bool = True,
    fallback_wait: int = 240,
    num_retries: int = 5,
) -> pd.DataFrame:  # pragma: no cover
    """
    Scrapes the reported MedDRA Preferred Terms and counts for a given medicine
    from the Australian DAEN database.

    Parameters
    -----------
    medicine: str
        Drug/medicine name to search.

    output_dir: str
        Directory to save the Excel (.xlsx) data file (default "data/daen").

    callback: callable, optional
        Callable to receive UI/status events, called with a dict.
        This is essential to show progress to user.

    headless: bool
        Run the browser in headless mode (default True).

    fallback_wait: int
        Seconds to wait for the browser download to finish in its default
        folder before attempting to move it to `output_dir`.

    num_retries: int
        Number of retries for data scraping after which error is thrown (default 5).

    Returns
    --------
    A dataFrame of the downloaded data.
    """

    def _emit(event_type: str, **kw: Any) -> None:  # pragma: no cover
        if callback:
            try:
                callback({"type": event_type, **kw})
            except Exception:  # pragma: no cover
                raise  # pragma: no cover

    med = (medicine or "").strip()
    if not med:
        _emit("error", message="Medicine is required for DAEN scrape")
        raise ValueError("medicine is required")  # pragma: no cover

    os.makedirs(output_dir, exist_ok=True)

    exceptions = []
    for attempt in range(num_retries):
        try:
            if attempt > 0:
                _emit("log", message=f"Retrying... ({attempt + 1}/{num_retries})\n")

            url = "https://daen.tga.gov.au/medicines-search/"
            _emit(
                "log",
                message=f"Opening DAEN (TGA) medicines search (Attempt {attempt + 1})\n",
            )

            with SB(uc=True, headless=headless) as sb:
                sb.open(url)

                try:
                    sb.scroll_into_view("input#termsCondition")
                    sb.click_if_visible("input#termsCondition", timeout=5)
                except Exception:  # pragma: no cover
                    pass

                outer_iframe = "div#reportContainer iframe"
                sb.wait_for_element_present(outer_iframe, timeout=60)
                with sb.frame_switch(outer_iframe):
                    inner_iframe = 'iframe[name="visual-sandbox"]'
                    sb.wait_for_element_present(inner_iframe, timeout=60)
                    with sb.frame_switch(inner_iframe):
                        search_box = 'input[placeholder*="Search"]'
                        sb.wait_for_element(search_box, timeout=60)
                        sb.clear(search_box)
                        sb.type(search_box, med)
                        sb.press_keys(search_box, Keys.ENTER)

                    try:
                        sb.wait_for_element(
                            '//*[@id="pvExplorationHost"]/div/div/exploration/div/explore-canvas/div/div[2]/div/div[2]/div[2]/visual-container-repeat/visual-container[34]/transform'
                        )
                        sb.sleep(10)
                        sb.scroll_into_view(
                            '//*[@id="pvExplorationHost"]/div/div/exploration/div/explore-canvas/div/div[2]/div/div[2]/div[2]/visual-container-repeat/visual-container[34]/transform'
                        )

                        row = '//*[@id="pvExplorationHost"]/div/div/exploration/div/explore-canvas/div/div[2]/div/div[2]/div[2]/visual-container-repeat/visual-container[34]/transform/div/div[3]/div/div/visual-modern/div/div/div[2]/div[1]/div[2]'
                        sb.click(row)
                        sb.sleep(1)

                        options_menu = '//*[@id="pvExplorationHost"]/div/div/exploration/div/explore-canvas/div/div[2]/div/div[2]/div[2]/visual-container-repeat/visual-container[34]/transform/div/visual-container-header/div/div/div/visual-container-options-menu'
                        sb.click(options_menu)
                        sb.sleep(1)

                        export_btn = '//*[@id="0"]'
                        sb.click(export_btn)
                        sb.sleep(1)

                        confirm_btn = '//*[@id="mat-mdc-dialog-0"]/div/div/export-data-dialog/mat-dialog-actions/button[1]'
                        sb.click(confirm_btn)
                    except Exception as e:  # pragma: no cover  # pragma: no cover
                        _emit("log", message=f"Export initiation failed: {e}")
                        raise  # pragma: no cover

                    sb.sleep(5)

                try:
                    stray_dir = os.path.join(os.getcwd(), "downloaded_files")
                    os.makedirs(output_dir, exist_ok=True)

                    end_time = time.time() + max(0, int(fallback_wait))
                    last_candidate = None
                    while time.time() < end_time:
                        if os.path.isdir(stray_dir):
                            entries = [
                                os.path.join(stray_dir, f)
                                for f in os.listdir(stray_dir)
                                if not f.endswith(".crdownload")
                                and not f.startswith(".")
                            ]
                            if entries:
                                entries.sort(
                                    key=lambda p: os.path.getmtime(p), reverse=True
                                )
                                last_candidate = entries[0]
                                break
                        time.sleep(1)

                    if not last_candidate or not os.path.isfile(last_candidate):
                        _emit(
                            "error",
                            message=(
                                "No completed download detected in 'downloaded_files' within "
                                f"{fallback_wait}s."
                            ),
                        )
                        raise RuntimeError(
                            "DAEN export file not detected"
                        )  # pragma: no cover

                    base_name = os.path.basename(last_candidate)
                    _root, ext = os.path.splitext(base_name)
                    if not ext:
                        ext = ".xlsx"
                    target_name = f"{med}_daen_export{ext}"
                    target_path = os.path.join(output_dir, target_name)

                    if os.path.isfile(target_path):
                        try:
                            os.remove(last_candidate)
                        except Exception:
                            raise  # pragma: no cover
                    else:
                        shutil.move(last_candidate, target_path)

                    _emit(
                        "log",
                        message=f"Data saved to: {os.path.abspath(target_path)}",
                    )

                    _emit("download_complete", path=target_path, filename=target_name)

                    try:
                        # The DAEN export is expected to be an Excel .xlsx file.
                        df = pd.read_excel(target_path, engine="openpyxl")
                        return df
                    except Exception as e:  # pragma: no cover  # pragma: no cover
                        _emit("log", message=f"Failed to read exported file: {e}")
                        raise  # pragma: no cover

                except Exception:  # pragma: no cover
                    raise  # pragma: no cover
        except Exception as e:  # pragma: no cover
            exceptions.append(e)
            _emit("log", message=f"Attempt {attempt + 1} failed with error: {e}.\n")
            time.sleep(20)
            continue

    _emit(
        "error",
        message=(
            f"All {num_retries} attempt(s) to scrape data for {medicine} failed. "
            "Please check the following:\n"
            "1. Ensure you have a stable internet connection.\n"
            "2. Verify that 'https://daen.tga.gov.au/medicines-search/' opens correctly in your Chrome browser.\n"
            "3. If these steps do not resolve the issue, please wait a while and retry. \n"
            "If problems persist, contact the developer at https://github.com/rmj3197/SurVigilance/issues "
            "for assistance.\n\n"
        ),
    )

    raise RuntimeError(
        f"All {num_retries} attempt(s) to scrape data for {medicine} failed. "
        "Please check the following:\n"
        "1. Ensure you have a stable internet connection.\n"
        "2. Verify that 'https://daen.tga.gov.au/medicines-search/' opens correctly in your Chrome browser.\n"
        "3. If these steps do not resolve the issue, please wait a while and retry. \n"
        "If problems persist, contact the developer at https://github.com/rmj3197/SurVigilance/issues "
        "for assistance.\n\n"
    )