"""
Scraper for Denmark DMA interactive ADR overviews using SeleniumBase.
"""
import os
import time
from collections.abc import Callable
from io import StringIO
from typing import Any
import pandas as pd
from bs4 import BeautifulSoup
from seleniumbase import SB
def _group_label(name: str) -> str | None: # pragma: no cover
name = (name or "").strip().lower()
second = name[1]
if "a" <= second <= "d":
return "a-d"
elif "e" <= second <= "h":
return "e-h"
elif "i" <= second <= "l":
return "i-l"
elif "m" <= second <= "p":
return "m-p"
elif "q" <= second <= "u":
return "q-u"
else:
return "v-z"
# [docs]  (appears to be a documentation "view source" link artifact; not part of the program)
def scrape_dma_sb(
medicine: str,
output_dir: str = "data/dma",
callback: Callable[[dict], None] | None = None,
headless: bool = True,
num_retries: int = 5,
) -> pd.DataFrame:
"""
Scrapes the reported MedDRA Preferred Terms and counts for a given medicine
from the Danish Medicines Agency database.
Parameters
-----------
medicine: str
Drug/medicine name to search.
output_dir: str
Directory to save CSV (default "data/dma").
callback : callable
Callable to receive UI/status events, called with a dict.
This is essential to show progress to user.
headless: bool
Run the browser in headless mode (default True).
num_retries: int
Number of retries for data scraping after which error is thrown (default 5).
Returns
--------
A dataframe with columns ['PT', 'Count'].
"""
def _emit(event_type: str, **kw: Any) -> None:
if callback:
try:
callback({"type": event_type, **kw})
except Exception: # pragma: no cover
raise # pragma: no cover
med = (medicine or "").strip()
if not med:
_emit("error", message="Medicine is required for DMA scrape")
raise ValueError("medicine is required") # pragma: no cover
os.makedirs(output_dir, exist_ok=True)
exceptions = []
for attempt in range(num_retries):
try:
if attempt > 0:
_emit("log", message=f"Retrying... ({attempt + 1}/{num_retries})\n")
with SB(uc=True, headless=headless) as sb:
url = "https://laegemiddelstyrelsen.dk/en/sideeffects/side-effects-of-medicines/interactive-adverse-drug-reaction-overviews/"
_emit(
"log",
message=f"Opening laegemiddelstyrelsen.dk (DMA) (Attempt {attempt + 1})\n",
)
sb.activate_cdp_mode(url)
_emit("progress", delta=20.0)
sb.sleep(1)
try:
sb.cdp.click_if_visible(
'//*[@id="CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"]'
)
sb.sleep(1)
except Exception: # pragma: no cover
pass
try:
sb.click(
'//*[@id="main-content"]/div/div/div[2]/div[1]/form/div/input'
)
sb.sleep(1)
except Exception: # pragma: no cover
pass
try:
first_char = med[0].upper()
if not ("A" <= first_char <= "Z"):
raise ValueError(
"Unsupported starting character for medicine"
) # pragma: no cover
alphabet_index = ord(first_char) - ord("A") + 1
sb.click(
f'//*[@id="main-content"]/div/div/div[2]/div[1]/section/div[2]/div[1]/a[{alphabet_index}]'
)
sb.sleep(1)
except Exception as e: # pragma: no cover
_emit("log", message=f"Failed selecting alphabet: {e}\n")
try:
group = _group_label(med)
if group:
sb.click(
f'a[href="?letter={med[0].upper()}&subletter={group}"]'
)
sb.sleep(1)
except Exception as e: # pragma: no cover
_emit("log", message=f"Skipping subgroup selection: {e}\n")
drugs_table_xpath = (
'//*[@id="main-content"]/div/div/div[2]/div[1]/section/table'
)
sb.wait_for_element_visible(drugs_table_xpath, timeout=30)
sb.sleep(2)
_emit("progress", delta=20.0)
table_text = sb.cdp.get_text(drugs_table_xpath) or ""
if med.lower() not in table_text.lower():
_emit("error", message=f"Drug '{med}' not found in DMA list")
raise RuntimeError("Drug not found in DMA list") # pragma: no cover
sb.click(
f"//*[translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz') = '{med.lower()}']"
)
sb.sleep(5)
tabs = sb.cdp.get_tabs()
for tab in tabs:
if (
tab.url
): # the first tab needs to be closed else it is creating issues.
sb.cdp.switch_to_tab(tab)
sb.cdp.close_active_tab()
break
else:
print("No tab without a URL was found.")
sb.wait_for_ready_state_complete()
_emit("progress", delta=20.0)
outer_iframe = 'iframe[src*="/upload/dap/dap.html?drug=./DK_EXTERNAL/NONCOMBINED/"]'
sb.wait_for_element_visible(outer_iframe, timeout=30)
if sb.is_element_present(outer_iframe):
with sb.frame_switch(outer_iframe):
try:
sb.click_if_visible("button#soc_expand_all_button")
except Exception as e: # pragma: no cover
_emit("log", message=f"Expand-all click issue: {e}")
if sb.is_element_present("#meddra_table"):
table_el = sb.find_element("#meddra_table")
table_html = table_el.get_attribute("outerHTML")
soup = BeautifulSoup(table_html, "html.parser")
table = soup.find("table", {"id": "meddra_table"})
df = pd.read_html(StringIO(str(table)))[0]
df.columns = [str(c).strip() for c in df.columns]
df = df.dropna(axis=1, how="all")
if df.shape[1] >= 2:
pt_col = df.columns[0]
count_col = df.columns[-2]
df = df.loc[:, [pt_col, count_col]]
df.columns = ["PT", "Count"]
df = df[df["PT"].astype(str).str.contains("\\+")]
df["PT"] = (
df["PT"]
.astype(str)
.str.replace("+", "", regex=False)
.str.strip()
)
sb.sleep(5)
df = df.reset_index(drop=True)
_emit("progress", delta=20.0)
target_name = f"{med}_dma_adrs.csv"
out_path = os.path.join(output_dir, target_name)
try:
df.to_csv(out_path, index=False)
_emit(
"log",
message=f"Data saved to: {os.path.abspath(out_path)}",
)
_emit(
"download_complete",
path=out_path,
filename=target_name,
)
_emit("progress", delta=20.0)
except Exception as e: # pragma: no cover
_emit("error", message=f"Failed to save CSV: {e}")
raise # pragma: no cover
sb.sleep(5)
_emit("done")
return df
except Exception as e: # pragma: no cover
exceptions.append(e)
_emit("log", message=f"Attempt {attempt + 1} failed.\n")
time.sleep(20)
continue
_emit(
"error",
message=(
f"All {num_retries} attempt(s) to scrape data for {medicine} failed. "
"Please check the following:\n"
"1. Ensure you have a stable internet connection.\n"
"2. Verify that 'https://laegemiddelstyrelsen.dk/en/sideeffects/side-effects-of-medicines/interactive-adverse-drug-reaction-overviews/' opens correctly in your Chrome browser.\n"
"3. If these steps do not resolve the issue, please wait a while and retry. \n"
"If problems persist, contact the developer at https://github.com/rmj3197/SurVigilance/issues "
"for assistance.\n\n"
),
)
raise RuntimeError(
f"All {num_retries} attempt(s) to scrape data for {medicine} failed. "
"Please check the following:\n"
"1. Ensure you have a stable internet connection.\n"
"2. Verify that 'https://laegemiddelstyrelsen.dk/en/sideeffects/side-effects-of-medicines/interactive-adverse-drug-reaction-overviews/' opens correctly in your Chrome browser.\n"
"3. If these steps do not resolve the issue, please wait a while and retry. \n"
"If problems persist, contact the developer at https://github.com/rmj3197/SurVigilance/issues "
"for assistance.\n\n"
)