# Source code for SurVigilance.ui.scrapers.scrape_vigiaccess
"""
Scraper for VigiAccess using SeleniumBase.
"""
import os
import time
import warnings
from collections.abc import Callable
from typing import Any
import pandas as pd
from seleniumbase import SB
# Runs at import time: installs a blanket "ignore" filter, so every warning
# category is silenced for any code that runs after this module is imported.
warnings.filterwarnings("ignore")
# [docs]
def scrape_vigiaccess_sb(
medicine: str,
output_dir: str = "data/vigiaccess",
callback: Callable[[dict], None] | None = None,
headless: bool = True,
num_retries: int = 5,
) -> pd.DataFrame:
"""
Scrapes the reported MedDRA Preferred Terms and counts for a given medicine from VigiAccess.
Parameters
-----------
medicine : str
Drug/medicine name to search.
output_dir : str
Directory to save CSV (default "data/vigiaccess").
callback : callable
Callable to receive UI/status events, called with a dict.
This is essential to show progress to user.
headless: bool
Run the browser in headless mode (default True).
num_retries: int
Number of retries for data scraping after which error is thrown (default 5).
Returns
--------
pd.DataFrame: A dataframe with columns ["PT", "Count"].
"""
def extract_clean_text(text_list: list[str]) -> list[str]: # pragma: no cover
cleaned = []
for text in text_list:
clean_str = "".join(
char for char in text if char.isalnum() or char.isspace()
)
cleaned.append(clean_str.strip().lower())
return cleaned
def _emit(event_type: str, **kw: Any) -> None: # pragma: no cover
if callback:
try:
callback({"type": event_type, **kw})
except Exception: # pragma: no cover
raise # pragma: no cover
os.makedirs(output_dir, exist_ok=True)
collected_lines = []
MAX_GROUPS = 26
exceptions = []
for attempt in range(num_retries):
try:
if attempt > 0:
_emit("log", message=f"Retrying... ({attempt + 1}/{num_retries})\n")
with SB(uc=True, headless=headless) as sb:
_emit(
"log", message=f"Parsing vigiaccess.org (Attempt {attempt + 1})\n"
)
try:
url = "https://www.vigiaccess.org/"
sb.activate_cdp_mode(url)
sb.sleep(0.5)
except Exception as e: # pragma: no cover
exceptions.append(e)
_emit("error", message=f"Failed to open site: {e}")
raise # pragma: no cover
try:
# sb.cdp.click(".level-left")
sb.cdp.scroll_into_view('//*[@id="elmish-app"]/footer')
if sb.is_element_visible(
'//*[@id="elmish-app"]/section/div/div[2]/nav/div[1]/div'
):
sb.cdp.click(
'//*[@id="elmish-app"]/section/div/div[2]/nav/div[1]/div/label'
)
sb.sleep(0.5)
sb.cdp.click(
'//*[@id="elmish-app"]/section/div/div[2]/nav/div[2]/div/button'
)
sb.sleep(0.5)
if sb.is_element_visible(".input"):
sb.type(".input", medicine)
sb.sleep(0.5)
sb.click(".button")
sb.sleep(1)
except Exception as e: # pragma: no cover
exceptions.append(e)
_emit("log", message=f"Search actions failed: {e}")
raise # pragma: no cover
try:
sb.cdp.wait_for_element_visible("td", timeout=20)
rows = sb.find_elements("tr")
row_text_list = [row.text for row in rows]
results = extract_clean_text(row_text_list)
try:
index = results.index(medicine.lower())
except ValueError:
index = 0 # Default to the first index if no match is found
sb.cdp.scroll_into_view(
f'//*[@id="elmish-app"]/div/section[1]/div/div/div[1]/div[2]/section/table/tbody/tr[{index + 1}]/td'
)
sb.cdp.click(
f'//*[@id="elmish-app"]/div/section[1]/div/div/div[1]/div[2]/section/table/tbody/tr[{index + 1}]/td'
)
sb.sleep(1.5)
sb.cdp.click_if_visible(
'//*[@id="elmish-app"]/div/section[1]/div/div/div[1]/div[2]/footer/button'
)
sb.sleep(1.5)
except Exception as e: # pragma: no cover
exceptions.append(e)
_emit("error", message=f"Failed entering results view: {e}")
raise # pragma: no cover
groups_xpath = '//*[@id="elmish-app"]/div/section[2]/div/div[2]/ul/li'
try:
sb.cdp.wait_for_element_visible(groups_xpath, timeout=20)
sb.sleep(0.5)
except Exception as e: # pragma: no cover
exceptions.append(e)
_emit("error", message=f"Reaction groups list not found: {e}")
raise # pragma: no cover
try:
group_items = sb.cdp.find_elements(groups_xpath)
total_groups = len(group_items)
if total_groups == 0:
_emit("log", message="No reaction groups found.")
except Exception: # pragma: no cover
total_groups = 0
for i in range(1, MAX_GROUPS + 1):
try:
title_span = f'//*[@id="elmish-app"]/div/section[2]/div/div[2]/ul/li[{i}]/span[1]'
expander_span = f'//*[@id="elmish-app"]/div/section[2]/div/div[2]/ul/li[{i}]/span[2]'
if not sb.cdp.is_element_present(title_span):
continue
# sb.cdp.gui_hover_element(title_span)
sb.cdp.click(expander_span)
sb.sleep(0.5)
while sb.cdp.is_element_visible(
'xpath=//*[contains(text(), "Load more...")]'
):
sb.cdp.click('//*[contains(text(), "Load more...")]')
sb.sleep(0.5)
entries_xpath = (
'//*[@id="elmish-app"]/div/section[2]/div/div[2]/ul/ul/li'
)
if sb.cdp.is_element_visible(entries_xpath):
entries = sb.cdp.find_elements(entries_xpath)
for el in entries:
txt = (el.text or "").strip()
if txt:
collected_lines.append(txt)
# sb.cdp.gui_hover_element(title_span)
sb.cdp.click(title_span)
sb.sleep(0.5)
except Exception as e: # pragma: no cover
exceptions.append(e)
_emit("log", message=f"Group {i}: skipping due to error: {e}")
_emit("progress", delta=100.0 / MAX_GROUPS)
data_map = {}
for raw in collected_lines:
line = raw.encode("ascii", "ignore").decode().strip()
if not line or "(" not in line or ")" not in line:
continue
try:
adr = line.rsplit("(", 1)[0].strip()
count = int(line.rsplit("(", 1)[1].split(")")[0])
data_map[adr] = count
except Exception: # pragma: no cover
raise # pragma: no cover
df = pd.DataFrame(data_map.items(), columns=["PT", "Count"]).reset_index(
drop=True
)
output_csv_path = os.path.join(
output_dir, f"{medicine}_vigiaccess_adrs.csv"
)
try:
df.to_csv(output_csv_path, index=False)
_emit(
"log", message=f"Data saved to: {os.path.abspath(output_csv_path)}"
)
except Exception as e: # pragma: no cover
_emit("error", message=f"Failed to save CSV: {e}")
_emit("done")
return df
except Exception as e: # pragma: no cover
exceptions.append(e)
_emit("log", message=f"Attempt {attempt + 1} failed.\n")
time.sleep(20)
continue
_emit(
"error",
message=(
f"All {num_retries} attempt(s) to scrape data for {medicine} failed. "
"Please check the following:\n"
"1. Ensure you have a stable internet connection.\n"
"2. Verify that 'https://www.vigiaccess.org/' opens correctly in your Chrome browser.\n"
"3. If these steps do not resolve the issue, please wait a while and retry. \n"
"If problems persist, contact the developer at https://github.com/rmj3197/SurVigilance/issues "
"for assistance.\n\n"
),
)
raise RuntimeError(
f"All {num_retries} attempt(s) to scrape data for {medicine} failed. "
"Please check the following:\n"
"1. Ensure you have a stable internet connection.\n"
"2. Verify that 'https://www.vigiaccess.org/' opens correctly in your Chrome browser.\n"
"3. If these steps do not resolve the issue, please wait a while and retry. \n"
"If problems persist, contact the developer at https://github.com/rmj3197/SurVigilance/issues "
"for assistance.\n\n"
)