# Source code for SurVigilance.ui.scrapers.scrape_vigiaccess

"""
Scraper for VigiAccess using SeleniumBase.
"""

import os
import time
import warnings
from collections.abc import Callable
from typing import Any

import pandas as pd
from seleniumbase import SB

warnings.filterwarnings("ignore")



# [docs]
def scrape_vigiaccess_sb(
    medicine: str,
    output_dir: str = "data/vigiaccess",
    callback: Callable[[dict], None] | None = None,
    headless: bool = True,
    num_retries: int = 5,
) -> pd.DataFrame:
    """
    Scrapes the reported MedDRA Preferred Terms and counts for a given medicine from VigiAccess.

    Parameters
    -----------
    medicine : str
        Drug/medicine name to search.

    output_dir : str
        Directory to save CSV (default "data/vigiaccess").

    callback : callable
        Callable to receive UI/status events, called with a dict.
        This is essential to show progress to user.

    headless: bool
        Run the browser in headless mode (default True).

    num_retries: int
        Number of retries for data scraping after which error is thrown (default 5).

    Returns
    --------
    pd.DataFrame: A dataframe with columns ["PT", "Count"].
    """

    def extract_clean_text(text_list: list[str]) -> list[str]:  # pragma: no cover
        cleaned = []
        for text in text_list:
            clean_str = "".join(
                char for char in text if char.isalnum() or char.isspace()
            )
            cleaned.append(clean_str.strip().lower())
        return cleaned

    def _emit(event_type: str, **kw: Any) -> None:  # pragma: no cover
        if callback:
            try:
                callback({"type": event_type, **kw})
            except Exception:  # pragma: no cover
                raise  # pragma: no cover

    os.makedirs(output_dir, exist_ok=True)

    collected_lines = []
    MAX_GROUPS = 26

    exceptions = []
    for attempt in range(num_retries):
        try:
            if attempt > 0:
                _emit("log", message=f"Retrying... ({attempt + 1}/{num_retries})\n")

            with SB(uc=True, headless=headless) as sb:
                _emit(
                    "log", message=f"Parsing vigiaccess.org (Attempt {attempt + 1})\n"
                )
                try:
                    url = "https://www.vigiaccess.org/"
                    sb.activate_cdp_mode(url)
                    sb.sleep(0.5)
                except Exception as e:  # pragma: no cover
                    exceptions.append(e)
                    _emit("error", message=f"Failed to open site: {e}")
                    raise  # pragma: no cover

                try:
                    # sb.cdp.click(".level-left")
                    sb.cdp.scroll_into_view('//*[@id="elmish-app"]/footer')
                    if sb.is_element_visible(
                        '//*[@id="elmish-app"]/section/div/div[2]/nav/div[1]/div'
                    ):
                        sb.cdp.click(
                            '//*[@id="elmish-app"]/section/div/div[2]/nav/div[1]/div/label'
                        )
                    sb.sleep(0.5)
                    sb.cdp.click(
                        '//*[@id="elmish-app"]/section/div/div[2]/nav/div[2]/div/button'
                    )
                    sb.sleep(0.5)

                    if sb.is_element_visible(".input"):
                        sb.type(".input", medicine)
                    sb.sleep(0.5)
                    sb.click(".button")
                    sb.sleep(1)
                except Exception as e:  # pragma: no cover
                    exceptions.append(e)
                    _emit("log", message=f"Search actions failed: {e}")
                    raise  # pragma: no cover

                try:
                    sb.cdp.wait_for_element_visible("td", timeout=20)
                    rows = sb.find_elements("tr")
                    row_text_list = [row.text for row in rows]
                    results = extract_clean_text(row_text_list)
                    try:
                        index = results.index(medicine.lower())
                    except ValueError:
                        index = 0  # Default to the first index if no match is found

                    sb.cdp.scroll_into_view(
                        f'//*[@id="elmish-app"]/div/section[1]/div/div/div[1]/div[2]/section/table/tbody/tr[{index + 1}]/td'
                    )
                    sb.cdp.click(
                        f'//*[@id="elmish-app"]/div/section[1]/div/div/div[1]/div[2]/section/table/tbody/tr[{index + 1}]/td'
                    )

                    sb.sleep(1.5)

                    sb.cdp.click_if_visible(
                        '//*[@id="elmish-app"]/div/section[1]/div/div/div[1]/div[2]/footer/button'
                    )
                    sb.sleep(1.5)
                except Exception as e:  # pragma: no cover
                    exceptions.append(e)
                    _emit("error", message=f"Failed entering results view: {e}")
                    raise  # pragma: no cover

                groups_xpath = '//*[@id="elmish-app"]/div/section[2]/div/div[2]/ul/li'
                try:
                    sb.cdp.wait_for_element_visible(groups_xpath, timeout=20)
                    sb.sleep(0.5)
                except Exception as e:  # pragma: no cover
                    exceptions.append(e)
                    _emit("error", message=f"Reaction groups list not found: {e}")
                    raise  # pragma: no cover

                try:
                    group_items = sb.cdp.find_elements(groups_xpath)
                    total_groups = len(group_items)
                    if total_groups == 0:
                        _emit("log", message="No reaction groups found.")
                except Exception:  # pragma: no cover
                    total_groups = 0

                for i in range(1, MAX_GROUPS + 1):
                    try:
                        title_span = f'//*[@id="elmish-app"]/div/section[2]/div/div[2]/ul/li[{i}]/span[1]'
                        expander_span = f'//*[@id="elmish-app"]/div/section[2]/div/div[2]/ul/li[{i}]/span[2]'

                        if not sb.cdp.is_element_present(title_span):
                            continue

                        # sb.cdp.gui_hover_element(title_span)
                        sb.cdp.click(expander_span)

                        sb.sleep(0.5)

                        while sb.cdp.is_element_visible(
                            'xpath=//*[contains(text(), "Load more...")]'
                        ):
                            sb.cdp.click('//*[contains(text(), "Load more...")]')
                            sb.sleep(0.5)

                        entries_xpath = (
                            '//*[@id="elmish-app"]/div/section[2]/div/div[2]/ul/ul/li'
                        )
                        if sb.cdp.is_element_visible(entries_xpath):
                            entries = sb.cdp.find_elements(entries_xpath)
                            for el in entries:
                                txt = (el.text or "").strip()
                                if txt:
                                    collected_lines.append(txt)

                        # sb.cdp.gui_hover_element(title_span)
                        sb.cdp.click(title_span)
                        sb.sleep(0.5)

                    except Exception as e:  # pragma: no cover
                        exceptions.append(e)
                        _emit("log", message=f"Group {i}: skipping due to error: {e}")

                    _emit("progress", delta=100.0 / MAX_GROUPS)

            data_map = {}
            for raw in collected_lines:
                line = raw.encode("ascii", "ignore").decode().strip()
                if not line or "(" not in line or ")" not in line:
                    continue
                try:
                    adr = line.rsplit("(", 1)[0].strip()
                    count = int(line.rsplit("(", 1)[1].split(")")[0])
                    data_map[adr] = count
                except Exception:  # pragma: no cover
                    raise  # pragma: no cover

            df = pd.DataFrame(data_map.items(), columns=["PT", "Count"]).reset_index(
                drop=True
            )

            output_csv_path = os.path.join(
                output_dir, f"{medicine}_vigiaccess_adrs.csv"
            )
            try:
                df.to_csv(output_csv_path, index=False)
                _emit(
                    "log", message=f"Data saved to: {os.path.abspath(output_csv_path)}"
                )
            except Exception as e:  # pragma: no cover
                _emit("error", message=f"Failed to save CSV: {e}")

            _emit("done")
            return df

        except Exception as e:  # pragma: no cover
            exceptions.append(e)
            _emit("log", message=f"Attempt {attempt + 1} failed.\n")
            time.sleep(20)
            continue

    _emit(
        "error",
        message=(
            f"All {num_retries} attempt(s) to scrape data for {medicine} failed. "
            "Please check the following:\n"
            "1. Ensure you have a stable internet connection.\n"
            "2. Verify that 'https://www.vigiaccess.org/' opens correctly in your Chrome browser.\n"
            "3. If these steps do not resolve the issue, please wait a while and retry. \n"
            "If problems persist, contact the developer at https://github.com/rmj3197/SurVigilance/issues "
            "for assistance.\n\n"
        ),
    )

    raise RuntimeError(
        f"All {num_retries} attempt(s) to scrape data for {medicine} failed. "
        "Please check the following:\n"
        "1. Ensure you have a stable internet connection.\n"
        "2. Verify that 'https://www.vigiaccess.org/' opens correctly in your Chrome browser.\n"
        "3. If these steps do not resolve the issue, please wait a while and retry. \n"
        "If problems persist, contact the developer at https://github.com/rmj3197/SurVigilance/issues "
        "for assistance.\n\n"
    )