diff --git a/gen3/external/nih/dbgap_fhir.py b/gen3/external/nih/dbgap_fhir.py index 00e1c6a6..caed7318 100644 --- a/gen3/external/nih/dbgap_fhir.py +++ b/gen3/external/nih/dbgap_fhir.py @@ -2,6 +2,7 @@ import collections import copy import csv +import re import time from cdislogging import get_logger @@ -133,6 +134,15 @@ def main(): "MolecularDataTypes", ] + # regex to detect markdown links e.g. `[Click Me](javascript:` + # capture groups: + # \1 -> The visible link text ("Click Me") + # \2 -> protocol ("javascript") + UNSAFE_MARKDOWN_LINK_START_RE = re.compile( + r"\[([^\]]*)\]\((javascript|vbscript|data):", + re.IGNORECASE, + ) + DISCLAIMER = ( "This information was retrieved from dbGaP's FHIR API for " "discoverability purposes and may not contain fully up-to-date " @@ -575,16 +585,67 @@ def _capitalize_top_level_keys(all_data): def _clean_value(self, value): """ - Replace tab literals in a string + Clean a string for downstream output. + + This combines two sanitization concerns in one pass: + 1) Strip unsafe markdown links like [text](javascript:...) + 2) Escape literal tab characters/backslashes for TSV safety """ if value is None: return "" + value = self._strip_unsafe_markdown_links(value) + # Double-escape existing backslashes # Convert every literal tab into the text “\t” value = value.replace("\\", "\\\\").replace("\t", r"\t") return value + def _strip_unsafe_markdown_links(self, value): + """ + Replace markdown links using unsafe schemes with only their link text. + + This parser handles nested parentheses, difficult with regex + """ + output = [] + cursor = 0 + + while True: + match = self.UNSAFE_MARKDOWN_LINK_START_RE.search(value, cursor) + if not match: + output.append(value[cursor:]) + break + + output.append(value[cursor : match.start()]) + link_text = match.group(1) + + # start after "[text](scheme:" + index = match.end() + depth = 1 + while index < len(value) and depth > 0: + if value[index] == "(": + depth += 1 + elif value[index] == ")": + depth -= 1 + index += 1 + + if depth == 0: + output.append(link_text) + cursor = index + else: + # in the case of unclosed parens/malformed link + # defuse the protocol name to make it un-executable. + # this is to save trailing textt. + link_text = match.group(1) + protocol = match.group(2) + + defused_start = f"[{link_text}](DEFUSED_{protocol}:" + output.append(defused_start) + + cursor = match.end() + + return "".join(output) + def _clean_structure(self, obj): """ Recursively walk a nested structure (dicts, lists, tuples) and clean every string diff --git a/tests/test_dbgap_fhir.py b/tests/test_dbgap_fhir.py index fb75e40c..bb2548df 100644 --- a/tests/test_dbgap_fhir.py +++ b/tests/test_dbgap_fhir.py @@ -5,6 +5,7 @@ import os import pytest import sys +from copy import deepcopy import requests from requests.auth import HTTPBasicAuth @@ -157,3 +158,58 @@ def _mock_request(path, **kwargs): assert _get_tsv_data(file_name) == _get_tsv_data( "tests/test_data/fhir_metadata.tsv" ) + + +def test_dbgap_fhir_sanitizes_unsafe_markdown_links(): + dbgap_fhir = dbgapFHIR( + api="https://example.com/fhir/x1", + auth_provider=HTTPBasicAuth("DATACITE_USERNAME", "DATACITE_PASSWORD"), + ) + + unsafe_response = deepcopy(MOCK_NIH_DBGAP_FHIR_RESPONSE_FOR_PHS000007) + unsafe_response[ + "description" + ] = "Description with [malformed JS](javascript:getPage(this, 'document.cgi', 2022 and also [unsafe link](javascript:getPage(this, 'document.cgi', 2022);return true;) and context" + unsafe_response["keyword"][0][ + "text" + ] = "[JS](javascript:getPage(this, 'document.cgi', 2022);return true;)" + unsafe_response["keyword"][1]["text"] = "[VB](vbscript:msgbox(1))" + unsafe_response["keyword"][2]["text"] = "[DATA](data:text/html;base64,PHNjcmlwdD4=)" + + clean_response = deepcopy(MOCK_NIH_DBGAP_FHIR_RESPONSE_FOR_PHS000166) + + def _mock_request(path, **kwargs): + if path == "ResearchStudy/phs000007": + return unsafe_response + if path == "ResearchStudy/phs000166": + return clean_response + + assert path in ["ResearchStudy/phs000007", "ResearchStudy/phs000166"] + + dbgap_fhir.fhir_client.server.request_json = MagicMock(side_effect=_mock_request) + + metadata = dbgap_fhir.get_metadata_for_ids( + [ + "phs000007.v1.p1.c1", + "phs000166.c3", + ] + ) + + unsafe_metadata = metadata["phs000007.v1.p1.c1"] + assert ( + unsafe_metadata["Description"] + == "Description with [malformed JS](DEFUSED_javascript:getPage(this, 'document.cgi', 2022 and also unsafe link and context" + ) + + assert isinstance(unsafe_metadata["Keyword"], list) + for item in unsafe_metadata["Keyword"]: + assert "javascript:" not in item.lower() + assert "vbscript:" not in item.lower() + assert "data:text/html" not in item.lower() + + assert "JS" in unsafe_metadata["Keyword"] + assert "VB" in unsafe_metadata["Keyword"] + assert "DATA" in unsafe_metadata["Keyword"] + + clean_metadata = metadata["phs000166.c3"] + assert clean_metadata["Description"] == clean_response["description"]