uc-cdis · k-burt-uch · Jun 4, 2026 · Jun 4, 2026
diff --git a/gen3/external/nih/dbgap_fhir.py b/gen3/external/nih/dbgap_fhir.py
@@ -2,6 +2,7 @@
 import collections
 import copy
 import csv
+import re
 import time
 
 from cdislogging import get_logger
@@ -133,6 +134,15 @@ def main():
         "MolecularDataTypes",
     ]
 
+    # regex to detect markdown links e.g. `[Click Me](javascript:`
+    # capture groups:
+    #   \1 -> The visible link text ("Click Me")
+    #   \2 -> protocol ("javascript")
+    UNSAFE_MARKDOWN_LINK_START_RE = re.compile(
+        r"\[([^\]]*)\]\((javascript|vbscript|data):",
+        re.IGNORECASE,
+    )
+
     DISCLAIMER = (
         "This information was retrieved from dbGaP's FHIR API for "
         "discoverability purposes and may not contain fully up-to-date "
@@ -575,16 +585,67 @@ def _capitalize_top_level_keys(all_data):
 
     def _clean_value(self, value):
         """
-        Replace tab literals in a string
+        Clean a string for downstream output.
+
+        This combines two sanitization concerns in one pass:
+        1) Strip unsafe markdown links like [text](javascript:...)
+        2) Escape literal tab characters/backslashes for TSV safety
         """
         if value is None:
             return ""
 
+        value = self._strip_unsafe_markdown_links(value)
+
         # Double-escape existing backslashes
         # Convert every literal tab into the text “\t”
         value = value.replace("\\", "\\\\").replace("\t", r"\t")
         return value
 
+    def _strip_unsafe_markdown_links(self, value):
+        """
+        Replace markdown links using unsafe schemes with only their link text.
+
+        This parser handles nested parentheses, difficult with regex
+        """
+        output = []
+        cursor = 0
+
+        while True:
+            match = self.UNSAFE_MARKDOWN_LINK_START_RE.search(value, cursor)
+            if not match:
+                output.append(value[cursor:])
+                break
+
+            output.append(value[cursor : match.start()])
+            link_text = match.group(1)
+
+            # start after "[text](scheme:"
+            index = match.end()
+            depth = 1
+            while index < len(value) and depth > 0:
+                if value[index] == "(":
+                    depth += 1
+                elif value[index] == ")":
+                    depth -= 1
+                index += 1
+
+            if depth == 0:
+                output.append(link_text)
+                cursor = index
+            else:
+                # in the case of unclosed parens/malformed link
+                # defuse the protocol name to make it un-executable.
+                # this is to save trailing textt.
+                link_text = match.group(1)
+                protocol = match.group(2)
+
+                defused_start = f"[{link_text}](DEFUSED_{protocol}:"
+                output.append(defused_start)
+
+                cursor = match.end()
+
+        return "".join(output)
+
     def _clean_structure(self, obj):
         """
         Recursively walk a nested structure (dicts, lists, tuples) and clean every string

diff --git a/tests/test_dbgap_fhir.py b/tests/test_dbgap_fhir.py
@@ -5,6 +5,7 @@
 import os
 import pytest
 import sys
+from copy import deepcopy
 
 import requests
 from requests.auth import HTTPBasicAuth
@@ -157,3 +158,58 @@ def _mock_request(path, **kwargs):
     assert _get_tsv_data(file_name) == _get_tsv_data(
         "tests/test_data/fhir_metadata.tsv"
     )
+
+
+def test_dbgap_fhir_sanitizes_unsafe_markdown_links():
+    dbgap_fhir = dbgapFHIR(
+        api="https://example.com/fhir/x1",
+        auth_provider=HTTPBasicAuth("DATACITE_USERNAME", "DATACITE_PASSWORD"),
+    )
+
+    unsafe_response = deepcopy(MOCK_NIH_DBGAP_FHIR_RESPONSE_FOR_PHS000007)
+    unsafe_response[
+        "description"
+    ] = "Description with [malformed JS](javascript:getPage(this, 'document.cgi', 2022 and also [unsafe link](javascript:getPage(this, 'document.cgi', 2022);return true;) and context"
+    unsafe_response["keyword"][0][
+        "text"
+    ] = "[JS](javascript:getPage(this, 'document.cgi', 2022);return true;)"
+    unsafe_response["keyword"][1]["text"] = "[VB](vbscript:msgbox(1))"
+    unsafe_response["keyword"][2]["text"] = "[DATA](data:text/html;base64,PHNjcmlwdD4=)"
+
+    clean_response = deepcopy(MOCK_NIH_DBGAP_FHIR_RESPONSE_FOR_PHS000166)
+
+    def _mock_request(path, **kwargs):
+        if path == "ResearchStudy/phs000007":
+            return unsafe_response
+        if path == "ResearchStudy/phs000166":
+            return clean_response
+
+        assert path in ["ResearchStudy/phs000007", "ResearchStudy/phs000166"]
+
+    dbgap_fhir.fhir_client.server.request_json = MagicMock(side_effect=_mock_request)
+
+    metadata = dbgap_fhir.get_metadata_for_ids(
+        [
+            "phs000007.v1.p1.c1",
+            "phs000166.c3",
+        ]
+    )
+
+    unsafe_metadata = metadata["phs000007.v1.p1.c1"]
+    assert (
+        unsafe_metadata["Description"]
+        == "Description with [malformed JS](DEFUSED_javascript:getPage(this, 'document.cgi', 2022 and also unsafe link and context"
+    )
+
+    assert isinstance(unsafe_metadata["Keyword"], list)
+    for item in unsafe_metadata["Keyword"]:
+        assert "javascript:" not in item.lower()
+        assert "vbscript:" not in item.lower()
+        assert "data:text/html" not in item.lower()
+
+    assert "JS" in unsafe_metadata["Keyword"]
+    assert "VB" in unsafe_metadata["Keyword"]
+    assert "DATA" in unsafe_metadata["Keyword"]
+
+    clean_metadata = metadata["phs000166.c3"]
+    assert clean_metadata["Description"] == clean_response["description"]