Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 62 additions & 1 deletion gen3/external/nih/dbgap_fhir.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import collections
import copy
import csv
import re
import time

from cdislogging import get_logger
Expand Down Expand Up @@ -133,6 +134,15 @@ def main():
"MolecularDataTypes",
]

# Regex that detects the start of a markdown link whose destination uses a
# script-capable scheme, e.g. `[Click Me](javascript:`.
#
# Markdown permits whitespace and an optional `<` between the opening
# parenthesis and the destination (`[x]( javascript:...)`,
# `[x](<javascript:...>)`), so both are tolerated here; otherwise a single
# leading space would be enough to slip past the check.
#
# Capture groups:
#   \1 -> the visible link text ("Click Me")
#   \2 -> the scheme as written in the input ("javascript")
#
# Matching is case-insensitive because URL schemes are case-insensitive.
UNSAFE_MARKDOWN_LINK_START_RE = re.compile(
    r"\[([^\]]*)\]\(\s*<?\s*(javascript|vbscript|data):",
    re.IGNORECASE,
)

DISCLAIMER = (
"This information was retrieved from dbGaP's FHIR API for "
"discoverability purposes and may not contain fully up-to-date "
Expand Down Expand Up @@ -575,16 +585,67 @@ def _capitalize_top_level_keys(all_data):

def _clean_value(self, value):
"""
Replace tab literals in a string
Clean a string for downstream output.

This combines two sanitization concerns in one pass:
1) Strip unsafe markdown links like [text](javascript:...)
2) Escape literal tab characters/backslashes for TSV safety
"""
if value is None:
return ""

value = self._strip_unsafe_markdown_links(value)

# Double-escape existing backslashes
# Convert every literal tab into the text “\t”
value = value.replace("\\", "\\\\").replace("\t", r"\t")
return value

def _strip_unsafe_markdown_links(self, value):
"""
Replace markdown links using unsafe schemes with only their link text.

This parser handles nested parentheses, difficult with regex
"""
output = []
cursor = 0

while True:
match = self.UNSAFE_MARKDOWN_LINK_START_RE.search(value, cursor)
if not match:
output.append(value[cursor:])
break

output.append(value[cursor : match.start()])
link_text = match.group(1)

# start after "[text](scheme:"
index = match.end()
depth = 1
while index < len(value) and depth > 0:
if value[index] == "(":
depth += 1
elif value[index] == ")":
depth -= 1
index += 1

if depth == 0:
output.append(link_text)
cursor = index
else:
# in the case of unclosed parens/malformed link
# defuse the protocol name to make it un-executable.
# this is to save trailing textt.
link_text = match.group(1)
protocol = match.group(2)

defused_start = f"[{link_text}](DEFUSED_{protocol}:"
output.append(defused_start)

cursor = match.end()

return "".join(output)

def _clean_structure(self, obj):
"""
Recursively walk a nested structure (dicts, lists, tuples) and clean every string
Expand Down
56 changes: 56 additions & 0 deletions tests/test_dbgap_fhir.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import pytest
import sys
from copy import deepcopy

import requests
from requests.auth import HTTPBasicAuth
Expand Down Expand Up @@ -157,3 +158,58 @@ def _mock_request(path, **kwargs):
assert _get_tsv_data(file_name) == _get_tsv_data(
"tests/test_data/fhir_metadata.tsv"
)


def test_dbgap_fhir_sanitizes_unsafe_markdown_links():
    """
    Unsafe markdown links in dbGaP FHIR responses are sanitized.

    The mocked phs000007 response carries one malformed and several
    well-formed unsafe links (javascript / vbscript / data); the mocked
    phs000166 response is left untouched and must come through with its
    description unchanged.
    """
    client = dbgapFHIR(
        api="https://example.com/fhir/x1",
        auth_provider=HTTPBasicAuth("DATACITE_USERNAME", "DATACITE_PASSWORD"),
    )

    # Build the poisoned response on a copy so the shared fixture stays intact.
    unsafe_response = deepcopy(MOCK_NIH_DBGAP_FHIR_RESPONSE_FOR_PHS000007)
    unsafe_response[
        "description"
    ] = "Description with [malformed JS](javascript:getPage(this, 'document.cgi', 2022 and also [unsafe link](javascript:getPage(this, 'document.cgi', 2022);return true;) and context"
    unsafe_keywords = (
        "[JS](javascript:getPage(this, 'document.cgi', 2022);return true;)",
        "[VB](vbscript:msgbox(1))",
        "[DATA](data:text/html;base64,PHNjcmlwdD4=)",
    )
    for position, keyword_text in enumerate(unsafe_keywords):
        unsafe_response["keyword"][position]["text"] = keyword_text

    clean_response = deepcopy(MOCK_NIH_DBGAP_FHIR_RESPONSE_FOR_PHS000166)

    canned_responses = {
        "ResearchStudy/phs000007": unsafe_response,
        "ResearchStudy/phs000166": clean_response,
    }

    def _mock_request(path, **kwargs):
        # Any path other than the two studies under test is a test failure.
        assert path in canned_responses
        return canned_responses.get(path)

    client.fhir_client.server.request_json = MagicMock(side_effect=_mock_request)

    study_ids = ["phs000007.v1.p1.c1", "phs000166.c3"]
    metadata = client.get_metadata_for_ids(study_ids)

    unsafe_metadata = metadata["phs000007.v1.p1.c1"]
    expected_description = "Description with [malformed JS](DEFUSED_javascript:getPage(this, 'document.cgi', 2022 and also unsafe link and context"
    assert unsafe_metadata["Description"] == expected_description

    keywords = unsafe_metadata["Keyword"]
    assert isinstance(keywords, list)
    for keyword in keywords:
        lowered = keyword.lower()
        for forbidden in ("javascript:", "vbscript:", "data:text/html"):
            assert forbidden not in lowered

    # Only the visible link text of each unsafe keyword link survives.
    for label in ("JS", "VB", "DATA"):
        assert label in keywords

    clean_metadata = metadata["phs000166.c3"]
    assert clean_metadata["Description"] == clean_response["description"]
Loading