Source code for SurVigilance.ui.scrapers.scrape_lareb

"""
Scraper for LAREB using SeleniumBase.
"""

import os
import time
import warnings
from collections.abc import Callable
from typing import Any

import pandas as pd
from seleniumbase import SB

warnings.filterwarnings("ignore")


def scrape_lareb_sb(
    medicine: str,
    output_dir: str = "data/lareb",
    callback: Callable[[dict], None] | None = None,
    headless: bool = True,
    num_retries: int = 5,
) -> pd.DataFrame:
    """
    Scrape the reported MedDRA Preferred Terms and counts for a given
    medicine from Lareb.

    Parameters
    ----------
    medicine : str
        Drug/medicine name to search.
    output_dir : str
        Directory to save the CSV (default "data/lareb").
    callback : callable, optional
        Callable that receives UI/status events, called with a dict.
        This is essential for showing progress to the user.
    headless : bool
        Run the browser in headless mode (default True).
    num_retries : int
        Number of scraping attempts after which an error is raised
        (default 5).

    Returns
    -------
    pandas.DataFrame
        A dataframe with columns ["PT", "Count"].
    """

    def _emit(event_type: str, **kw: Any) -> None:
        # Forward a status event to the UI callback, if one was provided.
        if callback:
            try:
                callback({"type": event_type, **kw})
            except Exception:  # pragma: no cover
                raise  # pragma: no cover

    med = (medicine or "").strip()
    if not med:
        _emit("error", message="Medicine is required for Lareb scrape")
        raise ValueError("medicine is required")  # pragma: no cover

    os.makedirs(output_dir, exist_ok=True)

    exceptions = []
    for attempt in range(num_retries):
        try:
            if attempt > 0:
                _emit("log", message=f"Retrying... ({attempt + 1}/{num_retries})\n")
            with SB(uc=True, headless=headless) as sb:
                _emit("log", message=f"Parsing lareb.nl (Attempt {attempt + 1})\n")

                # Open the Lareb site in CDP mode.
                try:
                    url = "https://www.lareb.nl/en"
                    sb.activate_cdp_mode(url)
                    sb.sleep(2)
                except Exception as e:  # pragma: no cover
                    _emit("error", message=f"Failed to open site: {e}")
                    raise  # pragma: no cover

                # Type the medicine name into the search box.
                try:
                    sb.sleep(1)
                    sb.scroll_into_view("input.input-search")
                    sb.sleep(1)
                    if sb.cdp.is_element_present("input.input-search"):
                        sb.cdp.type("input.input-search", med)
                    else:
                        sb.cdp.type('[class*="input-search"]', med)
                    sb.sleep(2)
                except Exception as e:  # pragma: no cover
                    _emit("log", message=f"Error encountered while searching: {e}")
                    raise  # pragma: no cover

                # Wait for the first autocomplete suggestion.
                try:
                    sb.sleep(2)
                    sb.cdp.wait_for_element_visible(
                        'div.autocomplete-suggestion[data-index="0"]', timeout=30
                    )
                except Exception as e:  # pragma: no cover
                    _emit(
                        "error",
                        message=(
                            "No autocomplete suggestion appeared - the drug may "
                            f"not exist on Lareb: {med}.\nDetails: {e}"
                        ),
                    )

                if sb.cdp.is_element_present(
                    'div.autocomplete-suggestion[data-index="0"]'
                ):
                    sb.sleep(1)
                    sb.cdp.click_if_visible(
                        'div.autocomplete-suggestion[data-index="0"]'
                    )
                    sb.sleep(2)

                # Submit the search.
                try:
                    sb.sleep(1.5)
                    sb.cdp.click("#search")
                    sb.sleep(3)
                except Exception as e:  # pragma: no cover
                    _emit("error", message=f"Couldn't click search button: {e}")
                    raise  # pragma: no cover

                # Wait for the results table and collect its rows.
                try:
                    sb.cdp.wait_for_element_visible("#registrationsTab", timeout=600)
                    sb.sleep(1.5)
                    sb.cdp.wait_for_element_visible(
                        "#registrationsTab tbody tr", timeout=600
                    )
                    rows = sb.cdp.find_elements("#registrationsTab tbody tr")
                    sb.sleep(2)
                except Exception as e:  # pragma: no cover
                    _emit("error", message=f"Couldn't find table: {e}")
                    raise  # pragma: no cover

                # Expand each row and capture the detail text.
                expanded_texts = []
                total_rows = len(rows) if rows else 0
                for i, row in enumerate(rows, start=1):
                    try:
                        sb.sleep(1)
                        expander = row.query_selector("td > div:nth-of-type(1)")
                        if expander:
                            expander.click()
                            sb.sleep(1.5)
                        details = row.query_selector("td > div:nth-of-type(2)")
                        if details:
                            # Poll briefly until the expanded text is populated.
                            for _ in range(10):
                                if details.text.strip():
                                    break
                                sb.sleep(0.3)
                            expanded_texts.append(details.text.strip())
                        else:
                            expanded_texts.append("")
                        if total_rows:
                            _emit("progress", delta=100.0 / total_rows)
                    except Exception as e:  # pragma: no cover
                        msg = f"Row {i}: expand failed: {e}"
                        _emit("error", message=msg)
                        expanded_texts.append("")

                sb.sleep(2)

                # Parse "Preferred Term: count" lines from the expanded text.
                data = []
                for idx, text_block in enumerate(expanded_texts):
                    if not text_block:
                        continue
                    for line in text_block.split("\n"):
                        try:
                            condition, count = line.rsplit(":", 1)
                            data.append(
                                {"PT": condition.strip(), "Count": int(count.strip())}
                            )
                        except ValueError:  # pragma: no cover
                            _emit(
                                "log",
                                message=f"Skipping malformed line in group {idx + 1}: {line}",
                            )

                df = pd.DataFrame(data).reset_index(drop=True)

                # Save the results to CSV and notify the UI.
                target_name = f"{med}_lareb_adrs.csv"
                output_csv_path = os.path.join(output_dir, target_name)
                try:
                    sb.sleep(1)
                    df.to_csv(output_csv_path, index=False)
                    _emit(
                        "log",
                        message=f"Data saved to: {os.path.abspath(output_csv_path)}",
                    )
                    _emit(
                        "download_complete",
                        path=output_csv_path,
                        filename=target_name,
                    )
                except Exception as e:  # pragma: no cover
                    _emit("error", message=f"Failed to save CSV: {e}")

                _emit("done")
                return df
        except Exception as e:  # pragma: no cover
            exceptions.append(e)
            _emit("log", message=f"Attempt {attempt + 1} failed.\n")
            time.sleep(20)
            continue

    # All attempts failed: report to the UI and raise.
    failure_message = (
        f"All {num_retries} attempt(s) to scrape data for {medicine} failed. "
        "Please check the following:\n"
        "1. Ensure you have a stable internet connection.\n"
        "2. Verify that 'https://www.lareb.nl/en' opens correctly in your Chrome browser.\n"
        "3. If these steps do not resolve the issue, please wait a while and retry.\n"
        "If problems persist, contact the developer at "
        "https://github.com/rmj3197/SurVigilance/issues for assistance.\n\n"
    )
    _emit("error", message=failure_message)
    raise RuntimeError(failure_message)
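
For reference, a minimal usage sketch (not part of the module source). It assumes the import path shown in the page title, SurVigilance.ui.scrapers.scrape_lareb, and uses a hypothetical print_event callback and an example drug name; only the function signature and the event keys ("type", "message", "delta", "path", "filename") come from the code above.

from SurVigilance.ui.scrapers.scrape_lareb import scrape_lareb_sb


def print_event(event: dict) -> None:
    # Hypothetical callback: print each status event emitted by the scraper.
    # Event dicts carry a "type" key ("log", "progress", "error",
    # "download_complete", "done") plus fields such as "message" or "delta".
    print(event)


if __name__ == "__main__":
    df = scrape_lareb_sb(
        medicine="ibuprofen",      # example drug name (assumption)
        output_dir="data/lareb",   # default output directory
        callback=print_event,      # receives progress/status dicts
        headless=True,
        num_retries=3,
    )
    print(df.head())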