# Source code for SurVigilance.ui.scrapers.scrape_dma

"""
Scraper for Denmark DMA interactive ADR overviews using SeleniumBase.
"""

import os
import time
from collections.abc import Callable
from io import StringIO
from typing import Any

import pandas as pd
from bs4 import BeautifulSoup
from seleniumbase import SB


def _group_label(name: str) -> str | None:  # pragma: no cover
    name = (name or "").strip().lower()
    second = name[1]
    if "a" <= second <= "d":
        return "a-d"
    elif "e" <= second <= "h":
        return "e-h"
    elif "i" <= second <= "l":
        return "i-l"
    elif "m" <= second <= "p":
        return "m-p"
    elif "q" <= second <= "u":
        return "q-u"
    else:
        return "v-z"


[docs] def scrape_dma_sb( medicine: str, output_dir: str = "data/dma", callback: Callable[[dict], None] | None = None, headless: bool = True, num_retries: int = 5, ) -> pd.DataFrame: """ Scrapes the reported MedDRA Preferred Terms and counts for a given medicine from the Danish Medicines Agency database. Parameters ----------- medicine: str Drug/medicine name to search. output_dir: str Directory to save CSV (default "data/dma"). callback : callable Callable to receive UI/status events, called with a dict. This is essential to show progress to user. headless: bool Run the browser in headless mode (default True). num_retries: int Number of retries for data scraping after which error is thrown (default 5). Returns -------- A dataframe with columns ['PT', 'Count']. """ def _emit(event_type: str, **kw: Any) -> None: if callback: try: callback({"type": event_type, **kw}) except Exception: # pragma: no cover raise # pragma: no cover med = (medicine or "").strip() if not med: _emit("error", message="Medicine is required for DMA scrape") raise ValueError("medicine is required") # pragma: no cover os.makedirs(output_dir, exist_ok=True) exceptions = [] for attempt in range(num_retries): try: if attempt > 0: _emit("log", message=f"Retrying... 
({attempt + 1}/{num_retries})\n") with SB(uc=True, headless=headless) as sb: url = "https://laegemiddelstyrelsen.dk/en/sideeffects/side-effects-of-medicines/interactive-adverse-drug-reaction-overviews/" _emit( "log", message=f"Opening laegemiddelstyrelsen.dk (DMA) (Attempt {attempt + 1})\n", ) sb.activate_cdp_mode(url) _emit("progress", delta=20.0) sb.sleep(1) try: sb.cdp.click_if_visible( '//*[@id="CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"]' ) sb.sleep(1) except Exception: # pragma: no cover pass try: sb.click( '//*[@id="main-content"]/div/div/div[2]/div[1]/form/div/input' ) sb.sleep(1) except Exception: # pragma: no cover pass try: first_char = med[0].upper() if not ("A" <= first_char <= "Z"): raise ValueError( "Unsupported starting character for medicine" ) # pragma: no cover alphabet_index = ord(first_char) - ord("A") + 1 sb.click( f'//*[@id="main-content"]/div/div/div[2]/div[1]/section/div[2]/div[1]/a[{alphabet_index}]' ) sb.sleep(1) except Exception as e: # pragma: no cover _emit("log", message=f"Failed selecting alphabet: {e}\n") try: group = _group_label(med) if group: sb.click( f'a[href="?letter={med[0].upper()}&subletter={group}"]' ) sb.sleep(1) except Exception as e: # pragma: no cover _emit("log", message=f"Skipping subgroup selection: {e}\n") drugs_table_xpath = ( '//*[@id="main-content"]/div/div/div[2]/div[1]/section/table' ) sb.wait_for_element_visible(drugs_table_xpath, timeout=30) sb.sleep(2) _emit("progress", delta=20.0) table_text = sb.cdp.get_text(drugs_table_xpath) or "" if med.lower() not in table_text.lower(): _emit("error", message=f"Drug '{med}' not found in DMA list") raise RuntimeError("Drug not found in DMA list") # pragma: no cover sb.click( f"//*[translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz') = '{med.lower()}']" ) sb.sleep(5) tabs = sb.cdp.get_tabs() for tab in tabs: if ( tab.url ): # the first tab needs to be closed else it is creating issues. 
sb.cdp.switch_to_tab(tab) sb.cdp.close_active_tab() break else: print("No tab without a URL was found.") sb.wait_for_ready_state_complete() _emit("progress", delta=20.0) outer_iframe = 'iframe[src*="/upload/dap/dap.html?drug=./DK_EXTERNAL/NONCOMBINED/"]' sb.wait_for_element_visible(outer_iframe, timeout=30) if sb.is_element_present(outer_iframe): with sb.frame_switch(outer_iframe): try: sb.click_if_visible("button#soc_expand_all_button") except Exception as e: # pragma: no cover _emit("log", message=f"Expand-all click issue: {e}") if sb.is_element_present("#meddra_table"): table_el = sb.find_element("#meddra_table") table_html = table_el.get_attribute("outerHTML") soup = BeautifulSoup(table_html, "html.parser") table = soup.find("table", {"id": "meddra_table"}) df = pd.read_html(StringIO(str(table)))[0] df.columns = [str(c).strip() for c in df.columns] df = df.dropna(axis=1, how="all") if df.shape[1] >= 2: pt_col = df.columns[0] count_col = df.columns[-2] df = df.loc[:, [pt_col, count_col]] df.columns = ["PT", "Count"] df = df[df["PT"].astype(str).str.contains("\\+")] df["PT"] = ( df["PT"] .astype(str) .str.replace("+", "", regex=False) .str.strip() ) sb.sleep(5) df = df.reset_index(drop=True) _emit("progress", delta=20.0) target_name = f"{med}_dma_adrs.csv" out_path = os.path.join(output_dir, target_name) try: df.to_csv(out_path, index=False) _emit( "log", message=f"Data saved to: {os.path.abspath(out_path)}", ) _emit( "download_complete", path=out_path, filename=target_name, ) _emit("progress", delta=20.0) except Exception as e: # pragma: no cover _emit("error", message=f"Failed to save CSV: {e}") raise # pragma: no cover sb.sleep(5) _emit("done") return df except Exception as e: # pragma: no cover exceptions.append(e) _emit("log", message=f"Attempt {attempt + 1} failed.\n") time.sleep(20) continue _emit( "error", message=( f"All {num_retries} attempt(s) to scrape data for {medicine} failed. " "Please check the following:\n" "1. 
Ensure you have a stable internet connection.\n" "2. Verify that 'https://laegemiddelstyrelsen.dk/en/sideeffects/side-effects-of-medicines/interactive-adverse-drug-reaction-overviews/' opens correctly in your Chrome browser.\n" "3. If these steps do not resolve the issue, please wait a while and retry. \n" "If problems persist, contact the developer at https://github.com/rmj3197/SurVigilance/issues " "for assistance.\n\n" ), ) raise RuntimeError( f"All {num_retries} attempt(s) to scrape data for {medicine} failed. " "Please check the following:\n" "1. Ensure you have a stable internet connection.\n" "2. Verify that 'https://laegemiddelstyrelsen.dk/en/sideeffects/side-effects-of-medicines/interactive-adverse-drug-reaction-overviews/' opens correctly in your Chrome browser.\n" "3. If these steps do not resolve the issue, please wait a while and retry. \n" "If problems persist, contact the developer at https://github.com/rmj3197/SurVigilance/issues " "for assistance.\n\n" )