# Source code for SurVigilance.ui.scrapers.scrape_vaers

"""
Downloader for VAERS yearly ZIPs.
"""

import os
import shutil
import time
from collections.abc import Callable
from typing import Any

import requests
from seleniumbase import SB


def vaers_intermediate_url(year: int) -> str:  # pragma: no cover
    """Return the VAERS intermediate download page URL for ``year``'s ZIP."""
    archive_name = f"{year}VAERSData.zip"
    return "https://vaers.hhs.gov/eSubDownload/index.jsp?fn=" + archive_name



# [docs]
def download_vaers_zip_sb(
    year: int,
    download_dir: str = "data/vaers",
    timeout: int = 600,
    callback: Callable[[dict], None] | None = None,
    headless: bool = True,
    fallback_wait: int = 120,
) -> str:  # pragma: no cover
    """
    Navigate the VAERS intermediate page, solve CAPTCHA, and download the ZIP.

    The file is streamed to ``<download_dir>/<year>VAERSData.zip``. If the
    "Download File" control does not appear within 60 seconds, the function
    instead waits for a browser-initiated download to land in
    ``<cwd>/downloaded_files`` and moves it into ``download_dir``.

    Parameters
    -----------
    year: int
        Year of the VAERS data

    download_dir: str
        Directory to save the ZIP (default "data/vaers")

    timeout: int
        Max seconds for the file download request (default 600s or 10 mins).

    callback: callable, optional
        Callable to receive UI/status events, called with a dict that always
        has a "type" key ("log", "error", "download_start",
        "download_progress" or "download_complete").
        This is essential to show progress to user. Exceptions raised by the
        callback are not caught and propagate to the caller.

    headless: bool
        Run the browser in headless mode (default True).

    fallback_wait: int
        Seconds to wait for a browser-initiated download to
        complete in browser default folder if the "Download File" button
        isn't found in time (default 120).

    Returns
    --------
    The full path of the downloaded ZIP file.

    Raises
    -------
    RuntimeError
        If no download URL could be determined from the page.
    requests.HTTPError
        If the download request returns an error status.
    """

    def _emit(event_type: str, **kw: Any) -> None:  # pragma: no cover
        # Forward a status event to the caller-supplied callback, if any.
        if callback:
            callback({"type": event_type, **kw})

    def _same_file(first: str, second: str) -> bool:  # pragma: no cover
        # True when both paths refer to the same existing file.
        try:
            return os.path.samefile(first, second)
        except OSError:
            return False

    def _stray_path() -> str:  # pragma: no cover
        # Where a browser-initiated download of this ZIP would be saved.
        return os.path.join(os.getcwd(), "downloaded_files", filename)

    def _adopt_browser_download() -> str | None:  # pragma: no cover
        # Move a finished browser download into download_dir; None if not ready.
        stray_path = _stray_path()
        still_downloading = os.path.isfile(stray_path + ".crdownload")
        if not os.path.isfile(stray_path) or still_downloading:
            return None
        os.makedirs(download_dir, exist_ok=True)
        if os.path.isfile(file_path):
            # An existing target wins and the browser copy is discarded,
            # unless both paths are the same file: removing it would then
            # delete the only copy.
            if not _same_file(stray_path, file_path):
                os.remove(stray_path)
        else:
            shutil.move(stray_path, file_path)
        return file_path if os.path.isfile(file_path) else None

    os.makedirs(download_dir, exist_ok=True)

    url = vaers_intermediate_url(int(year))
    filename = f"{year}VAERSData.zip"
    file_path = os.path.join(download_dir, filename)
    _emit("log", message=f"Opening VAERS page for {year}")

    with SB(uc=True, headless=headless) as sb:
        sb.activate_cdp_mode(url)

        sb.uc_gui_click_captcha()
        _emit("log", message="Attempted CAPTCHA solve.")

        download_xpath = "//*[self::a or self::button][contains(., 'Download File')]"
        try:
            sb.cdp.wait_for_element_visible(download_xpath, timeout=60)
        except Exception:  # pragma: no cover
            # The control never showed up; poll for a browser-initiated
            # download instead.
            _emit(
                "log",
                message=(
                    "'Download File' control not detected in 60s. "
                    f"Waiting up to {fallback_wait}s for a browser-initiated download."
                ),
            )
            end_time = time.time() + max(0, int(fallback_wait))
            while time.time() < end_time:
                adopted = _adopt_browser_download()
                if adopted is not None:
                    _emit("download_complete", path=adopted, filename=filename)
                    return adopted
                time.sleep(1)
            # Nothing arrived in time: fall through and retry the page lookup.

        # Best-effort resolution of the direct download URL; any failure here
        # simply leaves href empty and is reported below.
        try:
            elem = sb.cdp.find_element(download_xpath)
        except Exception:  # pragma: no cover
            elem = None
        href = None
        if elem is not None:
            try:
                href = elem.get_attribute("href")
            except Exception:  # pragma: no cover
                href = None

        if not href and elem is not None:
            try:
                # NOTE(review): presumably clicking navigates to the file URL,
                # so the current URL is used as the download link - confirm.
                sb.cdp.click(download_xpath)
                sb.sleep(1.0)
                href = sb.cdp.get_current_url()
            except Exception:  # pragma: no cover
                href = None

        if not href:
            _emit("error", message="Unable to determine download URL after clicking.")
            raise RuntimeError(
                "Could not resolve direct download URL for VAERS zip"
            )  # pragma: no cover

        # The context manager closes the session's connection pool on exit.
        with requests.Session() as sess:
            # Reuse the browser's user agent and cookies for the HTTP request.
            ua = sb.cdp.execute_script("return navigator.userAgent") or ""
            if isinstance(ua, str) and ua:
                sess.headers.update({"User-Agent": ua})

            for c in sb.cdp.driver.get_cookies():
                sess.cookies.set(
                    c.get("name"),
                    c.get("value"),
                    domain=c.get("domain"),
                    path=c.get("path"),
                )

            _emit("log", message="Starting VAERS data download")
            with sess.get(href, stream=True, timeout=timeout) as r:
                r.raise_for_status()

                # Header lookup is case-insensitive; a missing or malformed
                # value means "size unknown" and disables progress events.
                try:
                    total_bytes = int(r.headers.get("Content-Length") or 0)
                except (TypeError, ValueError):  # pragma: no cover
                    total_bytes = 0

                _emit(
                    "download_start",
                    url=href,
                    filename=filename,
                    total_bytes=total_bytes,
                )

                # Stream into a temporary file and rename on success so that a
                # failed transfer never leaves a truncated ZIP at file_path.
                partial_path = file_path + ".part"
                downloaded = 0
                completed = False
                try:
                    with open(partial_path, "wb") as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            if not chunk:
                                continue
                            f.write(chunk)
                            downloaded += len(chunk)
                            if total_bytes > 0:
                                _emit(
                                    "download_progress",
                                    downloaded_bytes=downloaded,
                                    total_bytes=total_bytes,
                                    percent=min(100, downloaded * 100 // total_bytes),
                                )
                    os.replace(partial_path, file_path)
                    completed = True
                finally:
                    if not completed and os.path.isfile(partial_path):
                        try:
                            os.remove(partial_path)
                        except OSError:  # pragma: no cover
                            # Best-effort cleanup; the original error propagates.
                            pass

        _emit("download_complete", path=file_path, filename=filename)

        # Remove any duplicate the browser saved itself, but never when it is
        # the very file that was just written.
        stray_path = _stray_path()
        if os.path.isfile(stray_path) and not _same_file(stray_path, file_path):
            os.remove(stray_path)

        return file_path