Source code for SurVigilance.ui.scrapers.scrape_lareb

"""
Scraper for LAREB using SeleniumBase.
"""

import os
import time
import warnings
from collections.abc import Callable
from typing import Any

import pandas as pd
from seleniumbase import SB

warnings.filterwarnings("ignore")


def scrape_lareb_sb(
    medicine: str,
    output_dir: str = "data/lareb",
    callback: Callable[[dict], None] | None = None,
    headless: bool = True,
    num_retries: int = 5,
) -> pd.DataFrame:
    """
    Scrape the reported MedDRA Preferred Terms and counts for a given
    medicine from Lareb.

    Parameters
    ----------
    medicine : str
        Drug/medicine name to search.
    output_dir : str
        Directory to save the CSV (default "data/lareb").
    callback : callable, optional
        Callable that receives UI/status events, called with a dict.
        This is essential for showing progress to the user.
    headless : bool
        Run the browser in headless mode (default True).
    num_retries : int
        Number of scraping attempts after which an error is raised
        (default 5).

    Returns
    -------
    pandas.DataFrame
        A dataframe with columns ["PT", "Count"].
    """

    def _emit(event_type: str, **kw: Any) -> None:
        # Forward a status event to the UI callback, if one was provided.
        if callback:
            try:
                callback({"type": event_type, **kw})
            except Exception:  # pragma: no cover
                raise  # pragma: no cover

    med = (medicine or "").strip()
    if not med:
        _emit("error", message="Medicine is required for Lareb scrape")
        raise ValueError("medicine is required")  # pragma: no cover

    os.makedirs(output_dir, exist_ok=True)

    exceptions = []
    for attempt in range(num_retries):
        try:
            if attempt > 0:
                _emit("log", message=f"Retrying... ({attempt + 1}/{num_retries})\n")
            with SB(uc=True, headless=headless) as sb:
                _emit("log", message=f"Parsing lareb.nl (Attempt {attempt + 1})\n")

                # Open the Lareb site in CDP mode.
                try:
                    url = "https://www.lareb.nl/en"
                    sb.activate_cdp_mode(url)
                    sb.sleep(2)
                except Exception as e:  # pragma: no cover
                    _emit("error", message=f"Failed to open site: {e}")
                    raise  # pragma: no cover

                # Type the medicine name into the search box.
                try:
                    sb.sleep(1)
                    sb.scroll_into_view("input.input-search")
                    sb.sleep(1)
                    if sb.cdp.is_element_present("input.input-search"):
                        sb.cdp.type("input.input-search", med)
                    else:
                        sb.cdp.type('[class*="input-search"]', med)
                    sb.sleep(2)
                except Exception as e:  # pragma: no cover
                    _emit("log", message=f"Error encountered while searching: {e}")
                    raise  # pragma: no cover

                # Wait for the first autocomplete suggestion.
                try:
                    sb.sleep(2)
                    sb.cdp.wait_for_element_visible(
                        'div.autocomplete-suggestion[data-index="0"]', timeout=30
                    )
                except Exception as e:  # pragma: no cover
                    _emit(
                        "error",
                        message=(
                            "No autocomplete suggestion appeared - the drug may "
                            f"not exist on Lareb: {med}.\nDetails: {e}"
                        ),
                    )

                if sb.cdp.is_element_present(
                    'div.autocomplete-suggestion[data-index="0"]'
                ):
                    sb.sleep(1)
                    sb.cdp.click_if_visible(
                        'div.autocomplete-suggestion[data-index="0"]'
                    )
                    sb.sleep(2)

                # Submit the search.
                try:
                    sb.sleep(1.5)
                    sb.cdp.click("#search")
                    sb.sleep(3)
                except Exception as e:  # pragma: no cover
                    _emit("error", message=f"Couldn't click search button: {e}")
                    raise  # pragma: no cover

                # Wait for the results table and collect its rows.
                try:
                    sb.cdp.wait_for_element_visible("#registrationsTab", timeout=600)
                    sb.sleep(1.5)
                    sb.cdp.wait_for_element_visible(
                        "#registrationsTab tbody tr", timeout=600
                    )
                    rows = sb.cdp.find_elements("#registrationsTab tbody tr")
                    sb.sleep(2)
                except Exception as e:  # pragma: no cover
                    _emit("error", message=f"Couldn't find table: {e}")
                    raise  # pragma: no cover

                # Expand each row and capture the detail text.
                expanded_texts = []
                total_rows = len(rows) if rows else 0
                for i, row in enumerate(rows, start=1):
                    try:
                        sb.sleep(1)
                        expander = row.query_selector("td > div:nth-of-type(1)")
                        if expander:
                            expander.click()
                            sb.sleep(1.5)
                        details = row.query_selector("td > div:nth-of-type(2)")
                        if details:
                            # Poll briefly until the expanded text is populated.
                            for _ in range(10):
                                if details.text.strip():
                                    break
                                sb.sleep(0.3)
                            expanded_texts.append(details.text.strip())
                        else:
                            expanded_texts.append("")
                        if total_rows:
                            _emit("progress", delta=100.0 / total_rows)
                    except Exception as e:  # pragma: no cover
                        msg = f"Row {i}: expand failed: {e}"
                        _emit("error", message=msg)
                        expanded_texts.append("")

                sb.sleep(2)

                # Parse "Preferred Term: count" lines from the expanded text.
                data = []
                for idx, text_block in enumerate(expanded_texts):
                    if not text_block:
                        continue
                    for line in text_block.split("\n"):
                        try:
                            condition, count = line.rsplit(":", 1)
                            data.append(
                                {"PT": condition.strip(), "Count": int(count.strip())}
                            )
                        except ValueError:  # pragma: no cover
                            _emit(
                                "log",
                                message=f"Skipping malformed line in group {idx + 1}: {line}",
                            )

                df = pd.DataFrame(data).reset_index(drop=True)

                # Save the results to CSV and notify the UI.
                target_name = f"{med}_lareb_adrs.csv"
                output_csv_path = os.path.join(output_dir, target_name)
                try:
                    sb.sleep(1)
                    df.to_csv(output_csv_path, index=False)
                    _emit(
                        "log",
                        message=f"Data saved to: {os.path.abspath(output_csv_path)}",
                    )
                    _emit(
                        "download_complete",
                        path=output_csv_path,
                        filename=target_name,
                    )
                except Exception as e:  # pragma: no cover
                    _emit("error", message=f"Failed to save CSV: {e}")

                _emit("done")
                return df
        except Exception as e:  # pragma: no cover
            exceptions.append(e)
            _emit("log", message=f"Attempt {attempt + 1} failed.\n")
            time.sleep(20)
            continue

    # All attempts failed: report to the UI and raise.
    failure_message = (
        f"All {num_retries} attempt(s) to scrape data for {medicine} failed. "
        "Please check the following:\n"
        "1. Ensure you have a stable internet connection.\n"
        "2. Verify that 'https://www.lareb.nl/en' opens correctly in your Chrome browser.\n"
        "3. If these steps do not resolve the issue, please wait a while and retry.\n"
        "If problems persist, contact the developer at "
        "https://github.com/rmj3197/SurVigilance/issues for assistance.\n\n"
    )
    _emit("error", message=failure_message)
    raise RuntimeError(failure_message)
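
For reference, a minimal usage sketch (not part of the module source). It assumes the import path shown in the page title, SurVigilance.ui.scrapers.scrape_lareb, and uses a hypothetical print_event callback and an example drug name; only the function signature and the event keys ("type", "message", "delta", "path", "filename") come from the code above.

from SurVigilance.ui.scrapers.scrape_lareb import scrape_lareb_sb


def print_event(event: dict) -> None:
    # Hypothetical callback: print each status event emitted by the scraper.
    # Event dicts carry a "type" key ("log", "progress", "error",
    # "download_complete", "done") plus fields such as "message" or "delta".
    print(event)


if __name__ == "__main__":
    df = scrape_lareb_sb(
        medicine="ibuprofen",      # example drug name (assumption)
        output_dir="data/lareb",   # default output directory
        callback=print_event,      # receives progress/status dicts
        headless=True,
        num_retries=3,
    )
    print(df.head())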