From 2ca3e68398a8a0505fc2d0c2d5e02f259f058b42 Mon Sep 17 00:00:00 2001
From: Aishwarya-Tonpe <aishwarya.tonpe25@gmail.com>
Date: Wed, 29 Apr 2026 23:37:02 +0000
Subject: [PATCH 01/12] feat: Add HuggingFace Hub model support for ORT and
 TensorRT inference benchmarks

- Add HuggingFaceModelLoader for downloading and caching models from HF Hub
- Support both NLP (AutoModelForCausalLM) and vision (AutoModelForImageClassification) models
- Add model_source and model_identifier parameters to TensorRT/ORT benchmarks
- Add ONNX export pipeline for HuggingFace models with dynamic axes
- Derive vision input shapes from ONNX graph dims with HF config fallback
- Filter ONNX initializers from graph.input for correct NLP input handling
- Add PyTorch 2.8+ compatibility (external_data vs use_external_data_format)
- Add example script, unit tests, and config schema updates
- Support HF_TOKEN env var for gated model access
---
 .../benchmarks/ort_inference_performance.py   |  73 ++-
 .../tensorrt_inference_performance.py         |  80 +++-
 .../micro_benchmarks/_export_torch_to_onnx.py | 168 ++++++-
 .../huggingface_model_loader.py               | 429 ++++++++++++++++++
 .../micro_benchmarks/model_source_config.py   |  89 ++++
 .../ort_inference_performance.py              | 164 ++++++-
 .../tensorrt_inference_performance.py         | 185 +++++++-
 .../micro_benchmarks/test_huggingface_e2e.py  | 103 +++++
 .../test_huggingface_loader.py                | 117 +++++
 .../test_model_source_config.py               |  73 +++
 tests/helper/decorator.py                     |   1 +
 11 files changed, 1467 insertions(+), 15 deletions(-)
 create mode 100644 superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
 create mode 100644 superbench/benchmarks/micro_benchmarks/model_source_config.py
 create mode 100644 tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py
 create mode 100644 tests/benchmarks/micro_benchmarks/test_huggingface_loader.py
 create mode 100644 tests/benchmarks/micro_benchmarks/test_model_source_config.py

diff --git a/examples/benchmarks/ort_inference_performance.py b/examples/benchmarks/ort_inference_performance.py
index 18bda2043..82cd6dec0 100644
--- a/examples/benchmarks/ort_inference_performance.py
+++ b/examples/benchmarks/ort_inference_performance.py
@@ -4,13 +4,30 @@
 """Micro benchmark example for ONNXRuntime inference performance.
 
 Commands to run:
+  In-house models:
     python3 examples/benchmarks/ort_inference_performance.py
+    python3 examples/benchmarks/ort_inference_performance.py --model_source in-house
+
+  HuggingFace models:
+    python3 examples/benchmarks/ort_inference_performance.py \
+      --model_source huggingface --model_identifier bert-base-uncased
+    python3 examples/benchmarks/ort_inference_performance.py \
+      --model_source huggingface --model_identifier microsoft/resnet-50
+    python3 examples/benchmarks/ort_inference_performance.py \
+      --model_source huggingface --model_identifier deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+
+Environment variables:
+  HF_TOKEN: HuggingFace token for gated models (optional)
 """
 
+import argparse
+
 from superbench.benchmarks import BenchmarkRegistry, Platform
 from superbench.common.utils import logger
 
-if __name__ == '__main__':
+
+def run_inhouse_benchmark():
+    """Run ORT inference with in-house torchvision models."""
     context = BenchmarkRegistry.create_benchmark_context(
         'ort-inference', platform=Platform.CUDA, parameters='--pytorch_models resnet50 resnet101 --precision float16'
     )
@@ -21,3 +38,57 @@
                 benchmark.name, benchmark.return_code, benchmark.result
             )
         )
+    return benchmark
+
+
+def run_huggingface_benchmark(model_identifier, precision='float16', batch_size=32, seq_length=512):
+    """Launch the 'ort-inference' benchmark on CUDA for a HuggingFace model and return the result.
+
+    Args:
+        model_identifier: HuggingFace model ID (e.g., 'bert-base-uncased').
+        precision: Value forwarded as --precision ('float32', 'float16', 'int8').
+        batch_size: Value forwarded as --batch_size.
+        seq_length: Value forwarded as --seq_length.
+    """
+    params = (
+        f'--model_source huggingface '
+        f'--model_identifier {model_identifier} '
+        f'--precision {precision} '
+        f'--batch_size {batch_size} '
+        f'--seq_length {seq_length}'
+    )
+    logger.info(f'Running ORT inference benchmark with HuggingFace model: {model_identifier}')
+
+    ctx = BenchmarkRegistry.create_benchmark_context('ort-inference', platform=Platform.CUDA, parameters=params)
+    bench = BenchmarkRegistry.launch_benchmark(ctx)
+    if not bench:
+        # Nothing to report; hand the falsy launch result back unchanged.
+        return bench
+
+    logger.info(
+        'benchmark: {}, return code: {}, result: {}'.format(bench.name, bench.return_code, bench.result)
+    )
+    return bench
+
+
+if __name__ == '__main__':
+    cli = argparse.ArgumentParser(description='ORT inference benchmark')
+    cli.add_argument(
+        '--model_source',
+        type=str,
+        choices=['in-house', 'huggingface'],
+        default='in-house',
+        help='Source of the model: in-house (default) or huggingface'
+    )
+    cli.add_argument(
+        '--model_identifier', type=str, default='bert-base-uncased', help='HuggingFace model identifier'
+    )
+    cli.add_argument('--precision', type=str, default='float16', choices=['float32', 'float16', 'int8'])
+    cli.add_argument('--batch_size', type=int, default=32)
+    cli.add_argument('--seq_length', type=int, default=512)
+    opts = cli.parse_args()
+    # Only the HuggingFace path consumes the remaining options; the in-house run takes none.
+    if opts.model_source != 'huggingface':
+        run_inhouse_benchmark()
+    else:
+        run_huggingface_benchmark(opts.model_identifier, opts.precision, opts.batch_size, opts.seq_length)
diff --git a/examples/benchmarks/tensorrt_inference_performance.py b/examples/benchmarks/tensorrt_inference_performance.py
index cacbf1177..4385a728e 100644
--- a/examples/benchmarks/tensorrt_inference_performance.py
+++ b/examples/benchmarks/tensorrt_inference_performance.py
@@ -4,13 +4,30 @@
 """Micro benchmark example for TensorRT inference performance.
 
 Commands to run:
+  In-house models:
     python3 examples/benchmarks/tensorrt_inference_performance.py
+    python3 examples/benchmarks/tensorrt_inference_performance.py --model_source in-house
+
+  HuggingFace models:
+    python3 examples/benchmarks/tensorrt_inference_performance.py \
+      --model_source huggingface --model_identifier bert-base-uncased
+    python3 examples/benchmarks/tensorrt_inference_performance.py \
+      --model_source huggingface --model_identifier microsoft/resnet-50
+    python3 examples/benchmarks/tensorrt_inference_performance.py \
+      --model_source huggingface --model_identifier deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+
+Environment variables:
+  HF_TOKEN: HuggingFace token for gated models (optional)
 """
 
+import argparse
+
 from superbench.benchmarks import BenchmarkRegistry, Platform
 from superbench.common.utils import logger
 
-if __name__ == '__main__':
+
+def run_inhouse_benchmark():
+    """Run TensorRT inference with in-house torchvision models."""
     context = BenchmarkRegistry.create_benchmark_context('tensorrt-inference', platform=Platform.CUDA)
     benchmark = BenchmarkRegistry.launch_benchmark(context)
     if benchmark:
@@ -19,3 +36,64 @@
                 benchmark.name, benchmark.return_code, benchmark.result
             )
         )
+    return benchmark
+
+
+def run_huggingface_benchmark(model_identifier, precision='fp16', batch_size=32, seq_length=512, iterations=2048):
+    """Launch the 'tensorrt-inference' benchmark on CUDA for a HuggingFace model and return the result.
+
+    Args:
+        model_identifier: HuggingFace model ID (e.g., 'bert-base-uncased').
+        precision: Value forwarded as --precision ('fp32', 'fp16', 'int8').
+        batch_size: Value forwarded as --batch_size.
+        seq_length: Value forwarded as --seq_length.
+        iterations: Value forwarded as --iterations.
+    """
+    params = (
+        f'--model_source huggingface '
+        f'--model_identifier {model_identifier} '
+        f'--precision {precision} '
+        f'--batch_size {batch_size} '
+        f'--seq_length {seq_length} '
+        f'--iterations {iterations}'
+    )
+    logger.info(f'Running TensorRT inference benchmark with HuggingFace model: {model_identifier}')
+
+    ctx = BenchmarkRegistry.create_benchmark_context(
+        'tensorrt-inference', platform=Platform.CUDA, parameters=params
+    )
+    bench = BenchmarkRegistry.launch_benchmark(ctx)
+    if not bench:
+        # Nothing to report; hand the falsy launch result back unchanged.
+        return bench
+
+    logger.info(
+        'benchmark: {}, return code: {}, result: {}'.format(bench.name, bench.return_code, bench.result)
+    )
+    return bench
+
+
+if __name__ == '__main__':
+    cli = argparse.ArgumentParser(description='TensorRT inference benchmark')
+    cli.add_argument(
+        '--model_source',
+        type=str,
+        choices=['in-house', 'huggingface'],
+        default='in-house',
+        help='Source of the model: in-house (default) or huggingface'
+    )
+    cli.add_argument(
+        '--model_identifier', type=str, default='bert-base-uncased', help='HuggingFace model identifier'
+    )
+    cli.add_argument('--precision', type=str, default='fp16', choices=['fp32', 'fp16', 'int8'])
+    cli.add_argument('--batch_size', type=int, default=32)
+    cli.add_argument('--seq_length', type=int, default=512)
+    cli.add_argument('--iterations', type=int, default=2048)
+    opts = cli.parse_args()
+    # Only the HuggingFace path consumes the remaining options; the in-house run takes none.
+    if opts.model_source != 'huggingface':
+        run_inhouse_benchmark()
+    else:
+        run_huggingface_benchmark(
+            opts.model_identifier, opts.precision, opts.batch_size, opts.seq_length, opts.iterations
+        )
diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
index 876d2ccfe..ab94f74e7 100644
--- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
+++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
@@ -3,28 +3,30 @@
 
 """Export PyTorch models to ONNX format."""
 
+import inspect
 from pathlib import Path
 
 from packaging import version
 import torch.hub
 import torch.onnx
 import torchvision.models
-from transformers import BertConfig, GPT2Config, LlamaConfig
 
-from superbench.benchmarks.model_benchmarks.pytorch_bert import BertBenchmarkModel
-from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import GPT2BenchmarkModel
-from superbench.benchmarks.model_benchmarks.pytorch_lstm import LSTMBenchmarkModel
-from superbench.benchmarks.model_benchmarks.pytorch_llama import LlamaBenchmarkModel
-from superbench.benchmarks.model_benchmarks.pytorch_mixtral import MixtralBenchmarkModel
+import traceback
 
-if MixtralBenchmarkModel is not None:
-    from transformers import MixtralConfig
+from superbench.common.utils import logger
 
 
 class torch2onnxExporter():
     """PyTorch model to ONNX exporter."""
     def __init__(self):
         """Constructor."""
+        from transformers import BertConfig, GPT2Config, LlamaConfig
+        from superbench.benchmarks.model_benchmarks.pytorch_bert import BertBenchmarkModel
+        from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import GPT2BenchmarkModel
+        from superbench.benchmarks.model_benchmarks.pytorch_lstm import LSTMBenchmarkModel
+        from superbench.benchmarks.model_benchmarks.pytorch_llama import LlamaBenchmarkModel
+        from superbench.benchmarks.model_benchmarks.pytorch_mixtral import MixtralBenchmarkModel
+
         self.num_classes = 100
         self.lstm_input_size = 256
         self.benchmark_models = {
@@ -129,6 +131,7 @@ def __init__(self):
 
         # Only include Mixtral models if MixtralBenchmarkModel is available
         if MixtralBenchmarkModel is not None:
+            from transformers import MixtralConfig
             self.benchmark_models.update(
                 {
                     'mixtral-8x7b':
@@ -270,3 +273,152 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512):
         del dummy_input
         torch.cuda.empty_cache()
         return file_name
+
+    def export_huggingface_model(self, model, model_name, batch_size=1, seq_length=512, output_dir=None):
+        """Export a HuggingFace model to ONNX format.
+
+        Args:
+            model: HuggingFace model instance to export.
+            model_name (str): Name for the exported ONNX model file.
+            batch_size (int): Batch size of input. Defaults to 1.
+            seq_length (int): Sequence length of input. Defaults to 512.
+            output_dir (str): Output directory path, created if missing. If None, uses default path.
+
+        Returns:
+            str: Exported ONNX model file path, or empty string if export fails.
+        """
+        try:
+            # Use custom output directory if provided; create it so the exporter can write the file
+            output_path = Path(output_dir) if output_dir else self._onnx_model_path
+            output_path.mkdir(parents=True, exist_ok=True)
+            file_name = str(output_path / (model_name + '.onnx'))
+            # Put model in eval mode and move to CUDA if available
+            model.eval()
+
+            # Disable cache to avoid DynamicCache issues with ONNX export
+            if hasattr(model.config, 'use_cache'):
+                model.config.use_cache = False
+
+            if torch.cuda.is_available():
+                model = model.cuda()
+
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+            # Get model's dtype for inputs
+            model_dtype = next(model.parameters()).dtype
+
+            # Detect model type and create appropriate inputs
+            # Vision models use pixel_values, NLP models use input_ids
+            # Use HuggingFace's main_input_name property for automatic detection
+            main_input = getattr(model, 'main_input_name', 'input_ids')
+            is_vision_model = main_input == 'pixel_values'
+
+            if is_vision_model:
+                # Vision models: use pixel_values (batch_size, channels, height, width)
+                # Derive C/H/W from model config rather than hard-coding 3x224x224
+                num_channels = getattr(model.config, 'num_channels', 3)
+                image_size = getattr(model.config, 'image_size', 224)
+                if isinstance(image_size, (list, tuple)):
+                    img_h, img_w = image_size[0], image_size[1]
+                else:
+                    img_h, img_w = image_size, image_size
+
+                dummy_input = torch.randn(batch_size, num_channels, img_h, img_w, dtype=model_dtype, device=device)
+                input_names = ['pixel_values']
+                dynamic_axes = {'pixel_values': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
+
+                # Wrapper for vision models
+                class VisionModelWrapper(torch.nn.Module):
+                    def __init__(self, model):
+                        super().__init__()
+                        self.model = model
+
+                    def forward(self, pixel_values):
+                        outputs = self.model(pixel_values=pixel_values)
+                        if hasattr(outputs, 'logits'):
+                            return outputs.logits
+                        elif hasattr(outputs, 'last_hidden_state'):
+                            return outputs.last_hidden_state
+                        else:
+                            return outputs[0] if isinstance(outputs, (tuple, list)) else outputs
+
+                wrapped_model = VisionModelWrapper(model)
+                export_args = (dummy_input, )
+            else:
+                # NLP models: use input_ids and attention_mask
+                dummy_input = torch.ones((batch_size, seq_length), dtype=torch.int64, device=device)
+                attention_mask = torch.ones((batch_size, seq_length), dtype=torch.int64, device=device)
+                input_names = ['input_ids', 'attention_mask']
+                dynamic_axes = {
+                    'input_ids': {
+                        0: 'batch_size',
+                        1: 'seq_length'
+                    },
+                    'attention_mask': {
+                        0: 'batch_size',
+                        1: 'seq_length'
+                    },
+                    'output': {
+                        0: 'batch_size',
+                        1: 'seq_length'
+                    },
+                }
+
+                # Wrapper for NLP models
+                class NLPModelWrapper(torch.nn.Module):
+                    def __init__(self, model):
+                        super().__init__()
+                        self.model = model
+
+                    def forward(self, input_ids, attention_mask):
+                        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
+                        if hasattr(outputs, 'logits'):
+                            return outputs.logits
+                        elif hasattr(outputs, 'last_hidden_state'):
+                            return outputs.last_hidden_state
+                        else:
+                            return outputs[0] if isinstance(outputs, (tuple, list)) else outputs
+
+                wrapped_model = NLPModelWrapper(model)
+                export_args = (dummy_input, attention_mask)
+
+            # For large models (>2GB of parameters), export to ONNX using the external data format
+            model_size_gb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024**3)
+            use_external_data = model_size_gb > 2.0
+
+            if use_external_data:
+                logger.info(f'Model size is {model_size_gb:.2f}GB, using external data format for ONNX export')
+
+            export_kwargs = {
+                'opset_version': 14,
+                'do_constant_folding': True,
+                'input_names': input_names,
+                'output_names': ['output'],
+                'dynamic_axes': dynamic_axes,
+            }
+            if use_external_data:
+                # PyTorch 2.8+ renamed 'use_external_data_format' to 'external_data'
+                sig = inspect.signature(torch.onnx.export)
+                if 'external_data' in sig.parameters:
+                    export_kwargs['external_data'] = True
+                else:
+                    export_kwargs['use_external_data_format'] = True
+
+            torch.onnx.export(
+                wrapped_model,
+                export_args,
+                file_name,
+                **export_kwargs,
+            )
+
+            # Clean up
+            del dummy_input
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+            return file_name
+
+        except Exception as e:
+            logger.error(f'Failed to export HuggingFace model to ONNX: {str(e)}')
+            logger.error(traceback.format_exc())
+            return ''
diff --git a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
new file mode 100644
index 000000000..9d8c55359
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
@@ -0,0 +1,429 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Hugging Face model loader for benchmarking."""
+
+import os
+from pathlib import Path
+from typing import Optional, Tuple
+
+import torch
+from transformers import (
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoConfig,
+    AutoTokenizer,
+    PreTrainedModel,
+    PretrainedConfig,
+)
+
+from superbench.common.utils import logger
+from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig
+
+
+class ModelLoadError(Exception):
+    """Base exception raised when loading a model fails."""
+    pass
+
+
+class ModelNotFoundError(ModelLoadError):
+    """Exception raised when the requested model cannot be found on Hugging Face Hub."""
+    pass
+
+
+class ModelIncompatibleError(ModelLoadError):
+    """Exception raised when a model is incompatible with ONNX export."""
+    pass
+
+
+class HuggingFaceModelLoader:
+    """Loads models from Hugging Face Hub for benchmarking.
+
+    This class handles downloading, caching, and loading models from
+    Hugging Face Hub with support for authentication, device mapping,
+    and compatibility validation.
+
+    Attributes:
+        cache_dir: Directory to cache downloaded models.
+        token: HuggingFace authentication token for private/gated models.
+    """
+    def __init__(self, cache_dir: Optional[str] = None, token: Optional[str] = None):
+        """Initialize the HuggingFace model loader and create its cache directory.
+
+        Args:
+            cache_dir: Cache directory. If unset, falls back to $HF_HOME, then ~/.cache/huggingface.
+            token: Auth token for private/gated models. If unset, uses $HF_TOKEN, then $HUGGING_FACE_HUB_TOKEN.
+        """
+        self.cache_dir = cache_dir or os.getenv('HF_HOME') or os.path.expanduser('~/.cache/huggingface')
+        self.token = token or os.getenv('HF_TOKEN') or os.getenv('HUGGING_FACE_HUB_TOKEN')
+
+        # Ensure cache directory exists (parents are created too; an existing directory is fine)
+        Path(self.cache_dir).mkdir(parents=True, exist_ok=True)
+
+        logger.info(f'HuggingFaceModelLoader initialized with cache_dir: {self.cache_dir}')
+        if self.token:
+            logger.info('Authentication token provided for private/gated models (token not logged)')
+
+    def load_model(
+        self,
+        model_identifier: str,
+        torch_dtype: Optional[str] = None,
+        device: str = 'cuda',
+        revision: Optional[str] = None,
+        device_map: Optional[str] = None,
+        config: Optional[PretrainedConfig] = None,
+        **kwargs
+    ) -> Tuple[PreTrainedModel, PretrainedConfig, Optional[AutoTokenizer]]:
+        """Load a model, its config, and (best effort) its tokenizer from Hugging Face Hub.
+
+        Args:
+            model_identifier: HF model ID (e.g., 'meta-llama/Llama-2-7b-hf').
+            torch_dtype: Data type for model weights ('float32', 'float16', 'bfloat16').
+            device: Device to load model on ('cuda', 'cpu').
+            revision: Specific model version/commit/tag to use.
+            device_map: Device mapping strategy for large models; when set, the model is not moved with .to().
+            config: Pre-downloaded model config. If None, downloads from Hub.
+            **kwargs: Additional arguments passed to every from_pretrained() call (config, tokenizer, model).
+
+        Returns:
+            Tuple of (model, config, tokenizer); tokenizer is None if it could not be loaded.
+
+        Raises:
+            ModelNotFoundError: If an OSError mentioning 'not found' or '404' is raised while loading.
+            ModelLoadError: If model loading fails for any other reason (the original error is chained).
+        """
+        logger.info(f'Loading model: {model_identifier}')
+
+        try:
+            # Convert torch_dtype string to torch dtype
+            dtype = self._get_torch_dtype(torch_dtype) if torch_dtype else None
+
+            # Prepare loading kwargs shared by the config, tokenizer, and model loads
+            load_kwargs = {'cache_dir': self.cache_dir, 'revision': revision, **kwargs}
+
+            # Add token if available
+            if self.token:
+                load_kwargs['token'] = self.token
+
+            # Add dtype if specified
+            if dtype:
+                load_kwargs['torch_dtype'] = dtype
+
+            # Load config (use pre-downloaded config if provided)
+            if config is None:
+                logger.info('Loading model configuration...')
+                config = AutoConfig.from_pretrained(model_identifier, trust_remote_code=True, **load_kwargs)
+            else:
+                logger.info('Using pre-downloaded model configuration.')
+
+            # Load tokenizer (may fail for some models, that's ok)
+            tokenizer = None
+            try:
+                logger.info('Loading tokenizer...')
+                tokenizer = AutoTokenizer.from_pretrained(model_identifier, trust_remote_code=True, **load_kwargs)
+            except Exception as e:
+                logger.warning(f'Could not load tokenizer: {e}. Continuing without tokenizer.')
+
+            # Load model. NOTE(review): trust_remote_code=True is unconditional here — confirm this is intended
+            logger.info(f'Loading model weights (dtype={torch_dtype}, device={device})...')
+            model_kwargs = load_kwargs.copy()
+            model_kwargs['trust_remote_code'] = True
+
+            # Handle device mapping for large models
+            effective_device_map = device_map
+            if device_map:
+                model_kwargs['device_map'] = device_map
+            elif device == 'cuda' and torch.cuda.is_available():
+                # Don't set device_map if device is explicitly cuda; the model is moved with .to(device) below
+                pass
+            elif device != 'cpu':
+                model_kwargs['device_map'] = device
+                effective_device_map = device
+
+            # Pass the config (pre-downloaded or just loaded) to from_pretrained so any overrides take effect
+            if config is not None:
+                model_kwargs['config'] = config
+
+            try:
+                model = AutoModel.from_pretrained(model_identifier, **model_kwargs)
+            except ValueError:
+                logger.info('AutoModel failed, trying AutoModelForCausalLM...')
+                model = AutoModelForCausalLM.from_pretrained(model_identifier, **model_kwargs)
+
+            # Move to device if not using device_map
+            if not effective_device_map and device != 'auto':
+                model = model.to(device)
+
+            logger.info(
+                f'Successfully loaded model: {model_identifier} '
+                f'({self._get_model_size(model):.2f}M parameters)'
+            )
+
+            return model, config, tokenizer
+
+        except OSError as e:
+            if 'not found' in str(e).lower() or '404' in str(e):
+                raise ModelNotFoundError(
+                    f"Model '{model_identifier}' not found on Hugging Face Hub. "
+                    f'Please check the model ID at https://huggingface.co/models'
+                ) from e
+            raise ModelLoadError(f"Failed to load model '{model_identifier}': {e}") from e
+        except Exception as e:
+            raise ModelLoadError(f"Unexpected error loading model '{model_identifier}': {e}") from e
+
+    def load_model_from_config(
+        self,
+        config: ModelSourceConfig,
+        device: Optional[str] = None,
+        config_pretrained: Optional[PretrainedConfig] = None,
+    ) -> Tuple[PreTrainedModel, PretrainedConfig, Optional[AutoTokenizer]]:
+        """Load a model described by a ModelSourceConfig by delegating to load_model().
+
+        Args:
+            config: ModelSourceConfig instance with loading parameters.
+            device: Device to load model on. If None, uses CUDA when available, else CPU.
+            config_pretrained: Pre-downloaded HF model config, forwarded to load_model() as `config`.
+
+        Returns:
+            Tuple of (model, config, tokenizer).
+
+        Raises:
+            ValueError: If config source is not 'huggingface' or the config fails validation.
+            ModelLoadError: If model loading fails.
+        """
+        if not config.is_huggingface():
+            raise ValueError(f"Cannot load model with source '{config.source}'. Use 'huggingface' source.")
+
+        # Reject configurations that fail their own validation
+        ok, reason = config.validate()
+        if not ok:
+            raise ValueError(f'Invalid configuration: {reason}')
+
+        # Default to CUDA when present, otherwise CPU
+        target_device = device if device is not None else ('cuda' if torch.cuda.is_available() else 'cpu')
+
+        # Forward the config fields plus any extra from_pretrained kwargs to load_model
+        loader_args = dict(
+            model_identifier=config.identifier,
+            torch_dtype=config.torch_dtype,
+            device=target_device,
+            revision=config.revision,
+            device_map=config.device_map,
+            config=config_pretrained,
+        )
+        return self.load_model(**loader_args, **config.additional_kwargs)
+
+    def _get_torch_dtype(self, dtype_str: str) -> torch.dtype:
+        """Convert a case-insensitive dtype string to the matching torch.dtype.
+
+        Args:
+            dtype_str: Dtype name; accepts 'float32'/'fp32', 'float16'/'fp16', 'bfloat16'/'bf16'.
+
+        Returns:
+            Corresponding torch.dtype.
+
+        Raises:
+            ValueError: If dtype string is invalid or unsupported for standard HF loading (e.g. 'int8').
+        """
+        normalized_dtype = dtype_str.lower()
+        if normalized_dtype == 'int8':
+            raise ValueError(
+                "Unsupported dtype 'int8' for Hugging Face model loading via torch_dtype. "
+                'Use a dedicated quantization/loading path for int8 models or apply int8 quantization '
+                'after export.'
+            )
+        dtype_map = {
+            'float32': torch.float32,
+            'float16': torch.float16,
+            'bfloat16': torch.bfloat16,
+            'fp32': torch.float32,
+            'fp16': torch.float16,
+            'bf16': torch.bfloat16,
+        }
+
+        if normalized_dtype not in dtype_map:
+            raise ValueError(f"Invalid dtype '{dtype_str}'. Must be one of {list(dtype_map.keys())}")
+
+        return dtype_map[normalized_dtype]
+
+    def _get_model_size(self, model: PreTrainedModel) -> float:
+        """Return the parameter count of a model, expressed in millions.
+
+        Args:
+            model: Model whose parameters are counted.
+        Returns:
+            Total number of parameters divided by one million.
+        """
+        total_params = sum(param.numel() for param in model.parameters())
+        return float(total_params) / 1_000_000
+
+    @staticmethod
+    def estimate_param_count_from_config(hf_config) -> Optional[int]:
+        """Estimate parameter count from a HuggingFace config without instantiating the model.
+
+        This avoids allocating tens/hundreds of GB of CPU RAM for large models (e.g. 70B).
+        The estimate covers embedding + transformer layers + LM head for common architectures.
+
+        Args:
+            hf_config: A HuggingFace PretrainedConfig object (any object exposing the same attributes).
+
+        Returns:
+            int: Estimated number of parameters, or None if estimation is not possible.
+        """
+        try:
+            vocab = getattr(hf_config, 'vocab_size', 0)
+            hidden = getattr(hf_config, 'hidden_size', 0)
+            layers = getattr(hf_config, 'num_hidden_layers', 0)
+            intermediate = getattr(hf_config, 'intermediate_size', None) or hidden * 4    # None => 4x hidden
+            num_heads = getattr(hf_config, 'num_attention_heads', 0)
+            num_kv_heads = getattr(hf_config, 'num_key_value_heads', None) or num_heads    # None => no GQA
+            head_dim = getattr(hf_config, 'head_dim', None) or (hidden // num_heads if num_heads > 0 else 0)
+
+            if vocab == 0 or hidden == 0 or layers == 0:
+                return None
+
+            # Embeddings: token + (optional) position
+            max_pos = getattr(hf_config, 'max_position_embeddings', 0)
+            has_pos_embed = getattr(hf_config, 'position_embedding_type', None) not in ('rotary', None)
+            embed_params = vocab * hidden
+            if has_pos_embed and max_pos > 0:
+                embed_params += max_pos * hidden
+
+            # Per transformer layer:
+            #   Self-attention: Q, K, V projections + output projection (sized from head_dim, which may be explicit)
+            #   MLP: gate_proj + up_proj + down_proj (LLaMA-style) or fc1 + fc2
+            #   Layer norms: 2 * hidden
+            qkv_params = (num_heads * head_dim + 2 * num_kv_heads * head_dim) * hidden
+            attn_out = (num_heads * head_dim or hidden) * hidden    # falls back to hidden^2 if heads are unknown
+            # For gated MLPs (LLaMA/Mistral), there are 3 matrices; otherwise 2
+            has_gate = getattr(hf_config, 'hidden_act', 'gelu') in ('silu', 'swiglu')
+            mlp_params = (3 if has_gate else 2) * hidden * intermediate
+            norm_params = 2 * hidden
+            layer_params = qkv_params + attn_out + mlp_params + norm_params
+
+            # MoE: if num_local_experts > 1, MLP is replicated per expert
+            num_experts = getattr(hf_config, 'num_local_experts', 1)
+            if num_experts > 1:
+                # Router + replicated MLP experts (attention is shared)
+                router_params = hidden * num_experts
+                layer_params = qkv_params + attn_out + norm_params + \
+                    num_experts * mlp_params + router_params
+
+            total_params = embed_params + layers * layer_params
+            # LM head (often tied to embedding, but count it for safety)
+            total_params += vocab * hidden
+            # Final layer norm
+            total_params += hidden
+
+            return total_params
+        except Exception as e:
+            logger.warning(f'Could not estimate param count from config: {e}')
+            return None
+
+    @staticmethod
+    def estimate_memory(param_count, precision_str, mode='training'):
+        """Estimate the memory a model needs and compare it with the memory available on this host.
+
+        The estimate is param_count * bytes-per-parameter * a mode multiplier. The model is
+        reported as fitting only when the estimate stays below 85% of the available memory.
+
+        Args:
+            param_count (int): Number of model parameters.
+            precision_str (str): Precision name, matched case-insensitively. 'float16', 'fp16',
+                'bfloat16' and 'bf16' count 2 bytes, 'int8' counts 1, anything else counts 4.
+            mode (str): 'training' applies a 4x multiplier; any other value applies 1.2x.
+
+        Returns:
+            tuple: (estimated_bytes, available_bytes, fits). available_bytes is the total memory of
+                CUDA device 0, else system RAM capped at 80GB, else 0 (with fits=True) without psutil.
+        """
+        # Bytes per parameter for the reduced precisions; anything not listed is treated as 4 bytes.
+        bytes_by_precision = {
+            'float16': 2,
+            'fp16': 2,
+            'bfloat16': 2,
+            'bf16': 2,
+            'int8': 1,
+        }
+        bytes_per_param = bytes_by_precision.get(precision_str.lower(), 4)
+
+        # Training keeps weights, gradients and two Adam states (4x); otherwise weights + ~20% overhead.
+        multiplier = 4 if mode == 'training' else 1.2
+        estimated_bytes = int(param_count * bytes_per_param * multiplier)
+
+        # Leave headroom for activations, framework overhead, etc.
+        headroom = 0.85
+
+        if torch.cuda.is_available():
+            # Only device 0 is inspected.
+            gpu_mem = torch.cuda.get_device_properties(0).total_memory
+            return estimated_bytes, gpu_mem, (estimated_bytes / gpu_mem) < headroom
+
+        # No CUDA device: fall back to system RAM as a stand-in for GPU memory.
+        try:
+            import psutil
+            sys_mem = psutil.virtual_memory().total
+        except ImportError:
+            logger.warning('psutil not installed — cannot check system memory. Skipping memory check.')
+            return estimated_bytes, 0, True
+
+        max_gpu_mem = 80 * (1024**3)    # 80GB — largest common single-GPU memory
+        effective_mem = min(sys_mem, max_gpu_mem)
+        return estimated_bytes, effective_mem, (estimated_bytes / effective_mem) < headroom
+
+    @staticmethod
+    def check_memory_fits(model_identifier, hf_config, precision_str, mode='training', token=None):
+        """Decide from the model config alone whether the model is expected to fit in memory.
+
+        The parameter count is estimated from hf_config and turned into a memory estimate, so
+        callers can bail out before load_model() downloads weights that would not fit. The
+        verdict is logged: info when the model fits, error when it does not.
+
+        Args:
+            model_identifier (str): HF model ID, used only in log messages.
+            hf_config: A HuggingFace PretrainedConfig object.
+            precision_str (str): Precision string ('float32', 'float16', etc.).
+            mode (str): 'training' or 'inference'.
+            token (str, optional): HF token; not read by this method.
+
+        Returns:
+            tuple: (fits, param_count_millions, estimated_gb, available_gb), with GB meaning 1e9 bytes.
+                   (True, 0, 0, 0) is returned when the parameter count cannot be estimated.
+        """
+        loader_cls = HuggingFaceModelLoader
+        param_count = loader_cls.estimate_param_count_from_config(hf_config)
+        if param_count is None:
+            # Without a parameter count there is nothing to compare; let the download proceed.
+            logger.warning(
+                f'Could not estimate param count from config for {model_identifier}. '
+                f'Proceeding with download — memory check skipped.'
+            )
+            return True, 0, 0, 0
+
+        estimated_bytes, available_bytes, fits = loader_cls.estimate_memory(param_count, precision_str, mode=mode)
+        param_millions = param_count / 1e6
+        estimated_gb, available_gb = estimated_bytes / 1e9, available_bytes / 1e9
+        verdict = (fits, param_millions, estimated_gb, available_gb)
+
+        if fits:
+            logger.info(
+                f'Model {model_identifier} ({param_millions:.1f}M params) estimated to need '
+                f'~{estimated_gb:.1f}GB for {mode}, fits in available memory ({available_gb:.1f}GB).'
+            )
+            return verdict
+
+        # Name what the budget covers and which memory pool was checked.
+        budget = ' + gradients + optimizer states' if mode == 'training' else ' + runtime overhead'
+        mem_type = 'GPU memory' if torch.cuda.is_available() else 'system RAM'
+        logger.error(
+            f'Model {model_identifier} ({param_millions:.1f}M params) estimated to need '
+            f'~{estimated_gb:.1f}GB for {mode} (weights{budget}), '
+            f'which exceeds available {mem_type} ({available_gb:.1f}GB). '
+            f'Skipping benchmark. Use a smaller model variant or a machine with more memory.'
+        )
+        return verdict
+
+    def __repr__(self) -> str:
+        """Describe the loader by cache directory and auth state, never exposing the token itself."""
+        auth_state = 'no authentication' if not self.token else 'authenticated'
+        return "HuggingFaceModelLoader(cache_dir='{}', {})".format(self.cache_dir, auth_state)
diff --git a/superbench/benchmarks/micro_benchmarks/model_source_config.py b/superbench/benchmarks/micro_benchmarks/model_source_config.py
new file mode 100644
index 000000000..99ca31870
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/model_source_config.py
@@ -0,0 +1,89 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Configuration classes for model source and loading."""
+
+from dataclasses import dataclass, field
+from typing import Optional, Dict, Any, Tuple
+
+
+@dataclass
+class ModelSourceConfig:
+    """Describe where a benchmark model comes from and how it should be loaded.
+
+    One instance carries everything needed to load a model from either the
+    in-house definitions or the Hugging Face Hub. Construction validates the
+    fields and raises ValueError on an unknown source, an unsupported
+    torch_dtype, or an empty identifier.
+
+    Attributes:
+        source: 'in-house' or 'huggingface'; lower-cased on construction.
+        identifier: Model name (in-house) or model ID (HuggingFace).
+        hf_token: Optional HuggingFace authentication token for private/gated models.
+        torch_dtype: Weight data type: 'float32', 'float16', 'bfloat16' or 'int8'.
+        revision: Specific model version/commit/tag to use.
+        cache_dir: Directory to cache downloaded models.
+        device_map: Device mapping strategy for model loading.
+        use_auth_token: Deprecated alias, copied into hf_token when hf_token is unset.
+        additional_kwargs: Additional keyword arguments for model loading.
+    """
+
+    source: str = 'in-house'
+    identifier: str = ''
+    hf_token: Optional[str] = None
+    torch_dtype: str = 'float32'
+    revision: Optional[str] = None
+    cache_dir: Optional[str] = None
+    device_map: Optional[str] = None
+    use_auth_token: Optional[str] = None    # Deprecated: use hf_token
+    additional_kwargs: Dict[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self):
+        """Normalize the fields and reject invalid values.
+
+        Raises:
+            ValueError: If source, torch_dtype or identifier is invalid.
+        """
+        # The deprecated use_auth_token only fills in when hf_token was not given.
+        if self.hf_token is None and self.use_auth_token is not None:
+            self.hf_token = self.use_auth_token
+
+        # The source is matched case-insensitively and stored lower-cased.
+        self.source = self.source.lower()
+        if self.source not in ('in-house', 'huggingface'):
+            raise ValueError(f"Invalid model source '{self.source}'. Must be 'in-house' or 'huggingface'.")
+
+        # torch_dtype is matched exactly (case-sensitive) against the supported names.
+        valid_dtypes = ['float32', 'float16', 'bfloat16', 'int8']
+        if self.torch_dtype not in valid_dtypes:
+            raise ValueError(f"Invalid torch_dtype '{self.torch_dtype}'. Must be one of {valid_dtypes}.")
+
+        # Every source needs a non-empty identifier.
+        if not self.identifier:
+            raise ValueError('Model identifier must be provided.')
+
+    def validate(self) -> Tuple[bool, str]:
+        """Re-check the configuration without raising.
+
+        Returns:
+            Tuple of (is_valid, error_message); error_message is empty when is_valid is True.
+        """
+        # Only HuggingFace sources are checked here: the identifier must contain
+        # something other than whitespace.
+        if self.source == 'huggingface' and not (self.identifier and self.identifier.strip()):
+            return (False, 'HuggingFace model identifier cannot be empty')
+        return (True, '')
+
+    def is_huggingface(self) -> bool:
+        """Tell whether the model comes from the Hugging Face Hub.
+
+        Returns:
+            True if source is 'huggingface', False otherwise.
+        """
+        return self.source == 'huggingface'
+
+    def __repr__(self) -> str:
+        """Summarize the configuration, reporting only whether a token is set, not its value."""
+        token_status = 'not set' if not self.hf_token else 'set'
+        summary = f"source='{self.source}', identifier='{self.identifier}', torch_dtype='{self.torch_dtype}'"
+        return f'ModelSourceConfig({summary}, hf_token={token_status})'
diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
index a472af121..2e0fff826 100644
--- a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
@@ -14,6 +14,8 @@
 from superbench.common.utils import logger
 from superbench.benchmarks import BenchmarkRegistry, Platform, Precision
 from superbench.benchmarks.micro_benchmarks import MicroBenchmark
+from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig
+from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader
 
 
 class ORTInferenceBenchmark(MicroBenchmark):
@@ -96,6 +98,32 @@ def add_parser_arguments(self):
             help='The number of test step for benchmarking.',
         )
 
+        # HuggingFace model arguments
+        self._parser.add_argument(
+            '--model_source',
+            type=str,
+            choices=['in-house', 'huggingface'],
+            default='in-house',
+            required=False,
+            help='Source of the model: in-house (default) or huggingface.',
+        )
+
+        self._parser.add_argument(
+            '--model_identifier',
+            type=str,
+            default=None,
+            required=False,
+            help='Model identifier for HuggingFace models (e.g., bert-base-uncased).',
+        )
+
+        self._parser.add_argument(
+            '--seq_length',
+            type=int,
+            default=512,
+            required=False,
+            help='Sequence length for transformer models.',
+        )
+
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
 
@@ -113,6 +141,11 @@ def _preprocess(self):
             3: ort.GraphOptimizationLevel.ORT_ENABLE_ALL,
         }
 
+        # Handle HuggingFace models if specified
+        if self._args.model_source == 'huggingface':
+            return self._preprocess_huggingface_models()
+
+        # Original in-house model processing
         for model in self._args.pytorch_models:
             if hasattr(torchvision.models, model):
                 data_type = Precision.FLOAT16.value if self._args.precision == Precision.FLOAT16 \
@@ -136,11 +169,118 @@ def _preprocess(self):
 
         return True
 
+    def _preprocess_huggingface_models(self):
+        """Download a HuggingFace model, export it to ONNX and register it for ORT benchmarking.
+
+        The config is fetched first so the memory estimate can veto the run before any weights
+        are downloaded. The model is then loaded on CPU and exported into a per-process
+        directory; for INT8 the float32 export is additionally quantized.
+
+        Returns:
+            bool: True if the model is ready; False if the identifier is missing, the memory
+                check fails, the export yields no path, or any step raises.
+        """
+        import os
+
+        model_id = self._args.model_identifier
+        if not model_id:
+            logger.error('--model_identifier is required when using --model_source huggingface')
+            return False
+
+        try:
+            logger.info(f'Loading HuggingFace model: {model_id}')
+
+            # Step 1: fetch only the config (few KB) and estimate memory before downloading weights.
+            from transformers import AutoConfig
+            hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')
+            auth_kwargs = {'token': hf_token} if hf_token else {}
+            # NOTE(review): trust_remote_code=True lets the Hub repository run its own Python code
+            # on this host; confirm that is acceptable for every model identifier used here.
+            hf_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True, **auth_kwargs)
+
+            # INT8 is produced by quantizing a float32 export, so weights are sized/loaded as float32.
+            wants_int8 = self._args.precision == Precision.INT8
+            torch_dtype = 'float32' if wants_int8 else self._args.precision.value
+            fits, _param_m, _est_gb, _avail_gb = HuggingFaceModelLoader.check_memory_fits(
+                model_id, hf_config, torch_dtype, mode='inference', token=hf_token
+            )
+            if not fits:
+                return False
+
+            # Step 2: download the weights and export to ONNX.
+            # Each process writes to its own rank_<id> directory so that concurrent exports of the
+            # same model do not race; PROC_RANK wins, CUDA_VISIBLE_DEVICES (default '0') is the fallback.
+            gpu_rank = os.getenv('CUDA_VISIBLE_DEVICES', '0')
+            proc_rank = os.getenv('PROC_RANK', gpu_rank)
+
+            # device_map=None and device='cpu' keep the model on CPU, avoiding accelerate dispatching
+            # it across multiple GPUs, which causes device mismatch during ONNX export.
+            model_config = ModelSourceConfig(
+                source='huggingface',
+                identifier=model_id,
+                hf_token=hf_token,
+                torch_dtype=torch_dtype,
+                device_map=None,
+            )
+            hf_model, _, _ = HuggingFaceModelLoader().load_model_from_config(model_config, device='cpu')
+
+            from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter
+            exporter = torch2onnxExporter()
+
+            # File names carry the precision: the exported one for the ONNX export, and the INT8
+            # value for the quantized model.
+            model_name = model_id.replace('/', '_')
+            export_precision = Precision.FLOAT32.value if wants_int8 else self._args.precision.value
+
+            # Output directory owned by this process.
+            rank_dir = self.__model_cache_path / f'rank_{proc_rank}'
+            rank_dir.mkdir(parents=True, exist_ok=True)
+
+            # Export straight into the final directory to avoid path issues with external data.
+            onnx_path = exporter.export_huggingface_model(
+                model=hf_model,
+                model_name=f'{model_name}.{export_precision}',
+                batch_size=self._args.batch_size,
+                seq_length=self._args.seq_length,
+                output_dir=str(rank_dir),
+            )
+
+            if not onnx_path:
+                logger.error(f'Failed to export {model_id} to ONNX')
+                return False
+
+            # INT8: quantize the float32 export (matching in-house model behavior).
+            if wants_int8:
+                from onnxruntime.quantization import quantize_dynamic
+                quantized_path = str(rank_dir / f'{model_name}.{Precision.INT8.value}.onnx')
+                quantize_dynamic(onnx_path, quantized_path)
+                logger.info('Applied INT8 quantization to HuggingFace model')
+
+            # Point the benchmark loop at the freshly exported model.
+            self._args.pytorch_models = [model_name]
+            self.__model_cache_path = rank_dir
+
+            logger.info('Successfully prepared HuggingFace model for ORT inference')
+            return True
+
+        except Exception as e:
+            # Any failure is logged with its traceback and reported as a failed preprocess.
+            logger.error(f'Failed to prepare HuggingFace model: {str(e)}')
+            import traceback
+            logger.error(traceback.format_exc())
+            return False
+
     def _benchmark(self):
         """Implementation for benchmarking."""
         import onnxruntime as ort
         precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'int8': 'int8'}
 
+        # Require CUDAExecutionProvider — this benchmark targets GPU inference
+        available = ort.get_available_providers()
+        if 'CUDAExecutionProvider' not in available:
+            logger.error(f'CUDAExecutionProvider is not available (available: {available}).')
+            return False
+
         for model in self._args.pytorch_models:
             sess_options = ort.SessionOptions()
             sess_options.graph_optimization_level = self.__graph_opt_level[self._args.graph_opt_level]
@@ -177,15 +317,33 @@ def __inference(self, ort_sess):
             elapse_times (List[float]): latency of every iterations.
         """
         precision = np.float16 if self._args.precision == Precision.FLOAT16 else np.float32
-        input_tensor = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision)
+
+        # Get input names from the ONNX session to determine input format
+        input_names = [input.name for input in ort_sess.get_inputs()]
+
+        # Determine input format based on what the model expects
+        if 'pixel_values' in input_names:
+            # Vision model: use pixel_values (batch_size, 3, 224, 224)
+            pixel_values = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision)
+            inputs = {'pixel_values': pixel_values}
+        elif 'input_ids' in input_names:
+            # NLP model: use input_ids and attention_mask
+            seq_len = getattr(self._args, 'seq_length', 512)
+            input_ids = np.random.randint(0, 30000, (self._args.batch_size, seq_len)).astype(np.int64)
+            attention_mask = np.ones((self._args.batch_size, seq_len), dtype=np.int64)
+            inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
+        else:
+            # Default for in-house torchvision models: use 'input' (batch_size, 3, 224, 224)
+            input_tensor = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision)
+            inputs = {'input': input_tensor}
 
         for i in range(self._args.num_warmup):
-            ort_sess.run(None, {'input': input_tensor})
+            ort_sess.run(None, inputs)
 
         elapse_times = list()
         for i in range(self._args.num_steps):
             start = time.time()
-            ort_sess.run(None, {'input': input_tensor})
+            ort_sess.run(None, inputs)
             end = time.time()
             elapse_times.append((end - start) * 1000)
 
diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
index 4d5a5b4b7..5153073a3 100644
--- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
@@ -3,13 +3,18 @@
 
 """TensorRT inference micro-benchmark."""
 
+import os
 import re
 from pathlib import Path
 
+import torch
+
 from superbench.common.utils import logger
 from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
 from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
 from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter
+from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig
+from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader
 
 
 class TensorRTInferenceBenchmark(MicroBenchmarkWithInvoke):
@@ -71,6 +76,24 @@ def add_parser_arguments(self):
             help='Run at least N inference iterations.',
         )
 
+        # HuggingFace model arguments
+        self._parser.add_argument(
+            '--model_source',
+            type=str,
+            choices=['in-house', 'huggingface'],
+            default='in-house',
+            required=False,
+            help='Source of the model: in-house (default) or huggingface.',
+        )
+
+        self._parser.add_argument(
+            '--model_identifier',
+            type=str,
+            default=None,
+            required=False,
+            help='Model identifier for HuggingFace models (e.g., bert-base-uncased).',
+        )
+
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
 
@@ -82,6 +105,11 @@ def _preprocess(self):
 
         self.__bin_path = str(Path(self._args.bin_dir) / self._bin_name)
 
+        # Handle HuggingFace models if specified
+        if self._args.model_source == 'huggingface':
+            return self._preprocess_huggingface_models()
+
+        # Original in-house model processing
         exporter = torch2onnxExporter()
         for model in self._args.pytorch_models:
             if not (exporter.check_torchvision_model(model) or exporter.check_benchmark_model(model)):
@@ -102,9 +130,8 @@ def _preprocess(self):
                 # model options
                 f'--onnx={onnx_model}',
                 # build options
-                '--explicitBatch',
                 f'--optShapes=input:{input_shape}',
-                '--workspace=8192',
+                '--memPoolSize=workspace:8192M',
                 None if self._args.precision == 'fp32' else f'--{self._args.precision}',
                 # inference options
                 f'--iterations={self._args.iterations}',
@@ -115,6 +142,160 @@ def _preprocess(self):
 
         return True
 
+    def _preprocess_huggingface_models(self):
+        """Export the HuggingFace model to ONNX and queue the TensorRT command that benchmarks it.
+
+        Returns:
+            bool: True once the command is queued; False if any precondition or step fails.
+        """
+        import os
+        from transformers import AutoConfig
+
+        if not self._args.model_identifier:
+            logger.error('--model_identifier is required when using --model_source huggingface')
+            return False
+
+        try:
+            # Step 1: Pre-download memory check — download only the config (a few KB)
+            # and estimate whether the full model will fit in GPU memory.
+            hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')
+            load_kwargs = {}
+            if hf_token:
+                load_kwargs['token'] = hf_token
+            # NOTE(review): trust_remote_code=True lets the model repo run its own code here; confirm intended.
+            hf_config = AutoConfig.from_pretrained(self._args.model_identifier, trust_remote_code=True, **load_kwargs)
+            precision_str = self._args.precision    # already a string: 'fp16', 'fp32', 'int8'
+            fits, param_m, est_gb, avail_gb = HuggingFaceModelLoader.check_memory_fits(
+                self._args.model_identifier, hf_config, precision_str, mode='inference', token=hf_token
+            )
+            if not fits:
+                return False
+
+            # Step 2: Download and load the full model
+
+            # Get GPU rank to create unique file paths and avoid race conditions
+            # when multiple processes export the same model simultaneously
+            gpu_rank = os.getenv('CUDA_VISIBLE_DEVICES', '0')
+            proc_rank = os.getenv('PROC_RANK', gpu_rank)
+
+            # Create model source config - load on CPU to avoid accelerate dispatching
+            # model across multiple GPUs which causes device mismatch during ONNX export.
+            # TensorRT handles precision internally via --fp16/--int8 flags,
+            # so the ONNX model is always exported in float32.
+            model_config = ModelSourceConfig(
+                source='huggingface',
+                identifier=self._args.model_identifier,
+                hf_token=hf_token,
+                torch_dtype='float32',
+                device_map=None,
+            )
+
+            logger.info(f'Loading HuggingFace model: {self._args.model_identifier}')
+
+            # Load model from HuggingFace on CPU; the returned config replaces the one fetched in step 1
+            loader = HuggingFaceModelLoader()
+            hf_model, hf_config, _ = loader.load_model_from_config(model_config, device='cpu')
+            self._hf_config = hf_config
+            exporter = torch2onnxExporter()
+
+            model_name = self._args.model_identifier.replace('/', '_')
+
+            # Prepare output path - use proc_rank subdirectory to avoid race conditions
+            # when multiple processes export the same model simultaneously
+            output_dir = str(Path(torch.hub.get_dir()) / 'checkpoints' / f'trt_rank_{proc_rank}')
+            os.makedirs(output_dir, exist_ok=True)
+
+            onnx_path = exporter.export_huggingface_model(
+                model=hf_model,
+                model_name=model_name,
+                batch_size=self._args.batch_size,
+                seq_length=self._args.seq_length,
+                output_dir=output_dir,
+            )
+
+            if not onnx_path:
+                logger.error(f'Failed to export {self._args.model_identifier} to ONNX')
+                return False
+
+            # Determine input shape based on model type by checking ONNX file
+            import onnx as onnx_lib
+            onnx_model = onnx_lib.load(onnx_path)
+
+            # Filter out initializers from graph.input to get only runtime inputs
+            initializer_names = {init.name for init in onnx_model.graph.initializer}
+            runtime_inputs = [inp for inp in onnx_model.graph.input if inp.name not in initializer_names]
+
+            # Get the first runtime input to determine shape and name
+            # (an empty runtime_inputs raises IndexError, handled by the except below)
+            input_name = runtime_inputs[0].name
+
+            # Vision models typically have 4D input (batch, channels, height, width)
+            # NLP models typically have 2D input (batch, sequence)
+            if input_name == 'pixel_values' or len(runtime_inputs[0].type.tensor_type.shape.dim) == 4:
+                # Vision model: derive C/H/W from ONNX graph or HF config
+                dims = runtime_inputs[0].type.tensor_type.shape.dim
+                # dims[0] is batch, dims[1:] are C, H, W; a dim_value <= 0 is treated as unknown (None)
+                c_dim = dims[1].dim_value if dims[1].dim_value > 0 else None
+                h_dim = dims[2].dim_value if dims[2].dim_value > 0 else None
+                w_dim = dims[3].dim_value if dims[3].dim_value > 0 else None
+                # Fall back to HF config metadata when ONNX dims are dynamic/unknown.
+                # NOTE(review): self._hf_config is assigned above, so the else branch below looks unreachable.
+                if hasattr(self, '_hf_config'):
+                    channels = c_dim or getattr(self._hf_config, 'num_channels', 3)
+                    image_size = getattr(self._hf_config, 'image_size', 224)
+                    if isinstance(image_size, (list, tuple)):
+                        height = h_dim or image_size[0]
+                        width = w_dim or image_size[1]
+                    else:
+                        height = h_dim or image_size
+                        width = w_dim or image_size
+                else:
+                    channels = c_dim or 3
+                    height = h_dim or 224
+                    width = w_dim or 224
+
+                input_shapes = f'{input_name}:{self._args.batch_size}x{channels}x{height}x{width}'
+            else:
+                # NLP model: batch x sequence - need to specify all inputs with same batch and seq length
+                seq_len = getattr(self._args, 'seq_length', 512)
+                shapes_list = []
+                for inp in runtime_inputs:
+                    inp_name = inp.name
+                    num_dims = len(inp.type.tensor_type.shape.dim)
+                    if num_dims == 2:
+                        # Standard 2D input: batch x sequence
+                        shapes_list.append(f'{inp_name}:{self._args.batch_size}x{seq_len}')
+                    elif num_dims == 4:
+                        # 4D input (rare for NLP, but handle it)
+                        shapes_list.append(f'{inp_name}:{self._args.batch_size}x1x{seq_len}x{seq_len}')
+                    else:
+                        # Default to 2D
+                        shapes_list.append(f'{inp_name}:{self._args.batch_size}x{seq_len}')
+                input_shapes = ','.join(shapes_list)
+
+            # Build the command with the detected input names; filter(None, ...) drops the precision flag for fp32
+            args = [
+                self.__bin_path,
+                f'--onnx={onnx_path}',
+                f'--optShapes={input_shapes}',
+                '--memPoolSize=workspace:8192M',
+                None if self._args.precision == 'fp32' else f'--{self._args.precision}',
+                f'--iterations={self._args.iterations}',
+                '--percentile=99',
+            ]
+            self._commands.append(' '.join(filter(None, args)))
+
+            # Store model name for result processing
+            self._args.pytorch_models = [self._args.model_identifier.replace('/', '_')]
+
+            logger.info('Successfully prepared HuggingFace model for TensorRT inference')
+            return True
+
+        except Exception as e:
+            logger.error(f'Failed to prepare HuggingFace model: {str(e)}')
+            import traceback
+            logger.error(traceback.format_exc())
+            return False
+
     def _process_raw_result(self, cmd_idx, raw_output):
         """Function to parse raw results and save the summarized results.
 
diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py
new file mode 100644
index 000000000..55c378500
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py
@@ -0,0 +1,103 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""End-to-end integration tests for HuggingFace model loading.
+
+These tests actually download and load models from HuggingFace Hub.
+The test class is skipped unless ``SB_TEST_HF_E2E=1`` is set, and
+``test_load_model_to_gpu`` is additionally skipped when
+``torch.cuda.is_available()`` is false.
+"""
+
+import os
+
+import pytest
+import torch
+
+pytest.importorskip('transformers')
+
+from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader
+from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig
+
+
+@pytest.mark.skipif(os.environ.get('SB_TEST_HF_E2E', '0') != '1', reason='Skip HF E2E tests. Set SB_TEST_HF_E2E=1 to enable.')
+class TestHuggingFaceE2E:
+    """End-to-end tests for HuggingFace model loading."""
+    @pytest.fixture
+    def loader(self):
+        """Create a loader instance."""
+        return HuggingFaceModelLoader(cache_dir='/tmp/hf_test_cache')
+
+    def test_load_tiny_bert_model(self, loader):
+        """Test loading a tiny BERT model from HuggingFace Hub.
+
+        Uses prajjwal1/bert-tiny which is a small public BERT model (~17MB).
+        """
+        model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu')
+
+        assert model is not None
+        assert config is not None
+        assert config.model_type == 'bert'
+
+        # Smoke-test a forward pass on one sequence of 10 random token ids drawn from [0, 1000)
+        dummy_input = torch.randint(0, 1000, (1, 10))
+        with torch.no_grad():
+            output = model(dummy_input)
+        assert output is not None
+
+    def test_load_distilgpt2_model(self, loader):
+        """Test loading DistilGPT2 model from HuggingFace Hub.
+
+        Uses distilbert/distilgpt2 which is a small public GPT-2 model (~82MB).
+        """
+        model, config, tokenizer = loader.load_model('distilbert/distilgpt2', device='cpu')
+
+        assert model is not None
+        assert config is not None
+        assert config.model_type == 'gpt2'
+
+        # Smoke-test a forward pass on one sequence of 10 random token ids drawn from [0, 1000)
+        dummy_input = torch.randint(0, 1000, (1, 10))
+        with torch.no_grad():
+            output = model(dummy_input)
+        assert output is not None
+
+    def test_load_model_from_config(self, loader):
+        """Test loading model using ModelSourceConfig via load_model_from_config."""
+        config = ModelSourceConfig(source='huggingface', identifier='prajjwal1/bert-tiny', torch_dtype='float32')
+
+        model, hf_config, tokenizer = loader.load_model_from_config(config, device='cpu')
+
+        assert model is not None
+        assert hf_config.model_type == 'bert'
+
+    def test_load_model_with_dtype(self, loader):
+        """Test that a loaded model has float32 parameters after ``.float()`` (no torch_dtype is passed)."""
+        model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu')
+
+        # Convert after loading; this does not exercise load_model's own torch_dtype argument
+        model = model.float()
+
+        # Only the first parameter is inspected as a representative sample
+        param = next(model.parameters())
+        assert param.dtype == torch.float32
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason='Requires GPU')
+    def test_load_model_to_gpu(self, loader):
+        """Test loading model on CPU and then moving it to GPU with ``.cuda()``."""
+        model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu')
+
+        # Move to GPU manually (the loader itself is only asked for device='cpu')
+        model = model.cuda()
+
+        # Only the first parameter is inspected as a representative sample
+        param = next(model.parameters())
+        assert param.device.type == 'cuda'
+
+    def test_architecture_detection(self, loader):
+        """Test that the config returned for bert-tiny reports a BERT ``model_type``."""
+        model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu')
+
+        # Case-insensitive substring check, so variants such as 'BERT' would also pass
+        assert config.model_type is not None
+        assert 'bert' in config.model_type.lower()
diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py b/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py
new file mode 100644
index 000000000..e679fb068
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py
@@ -0,0 +1,117 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Unit tests for HuggingFaceModelLoader."""
+
+import pytest
+import torch
+from unittest.mock import MagicMock, patch
+
+from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import (
+    HuggingFaceModelLoader,
+    ModelNotFoundError,
+)
+from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig
+
+
+class TestHuggingFaceModelLoader:
+    """Test cases for HuggingFaceModelLoader class."""
+    @pytest.fixture
+    def loader(self, tmp_path, monkeypatch):
+        """Create a loader for testing, isolated from HF tokens exported in the caller's environment."""
+        for var in ('HF_TOKEN', 'HUGGING_FACE_HUB_TOKEN'):
+            monkeypatch.delenv(var, raising=False)
+        return HuggingFaceModelLoader(cache_dir=str(tmp_path / 'test_cache'), token=None)
+
+    def test_initialization(self, loader, tmp_path):
+        """Test loader initialization."""
+        assert loader.cache_dir == str(tmp_path / 'test_cache')
+        assert loader.token is None
+
+    def test_initialization_with_env_token(self, monkeypatch, tmp_path):
+        """Test loader picks up token from environment."""
+        monkeypatch.setenv('HF_TOKEN', 'env_token')
+        monkeypatch.setenv('HF_HOME', str(tmp_path / 'hf_cache'))
+        loader = HuggingFaceModelLoader()
+        assert loader.token == 'env_token'
+
+    def test_get_torch_dtype_valid(self, loader):
+        """Test torch dtype conversion."""
+        assert loader._get_torch_dtype('float32') == torch.float32
+        assert loader._get_torch_dtype('float16') == torch.float16
+        assert loader._get_torch_dtype('fp16') == torch.float16
+        assert loader._get_torch_dtype('bfloat16') == torch.bfloat16
+
+    def test_get_torch_dtype_invalid(self, loader):
+        """Test invalid dtype raises error."""
+        with pytest.raises(ValueError, match='Invalid dtype'):
+            loader._get_torch_dtype('invalid_dtype')
+
+    @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoModel')
+    @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoConfig')
+    @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoTokenizer')
+    def test_load_model_success(self, mock_tokenizer, mock_config, mock_model, loader):
+        """Test successful model loading."""
+        # Mock config
+        mock_cfg = MagicMock()
+        mock_cfg.model_type = 'bert'
+        mock_config.from_pretrained.return_value = mock_cfg
+
+        # Mock model
+        mock_mdl = MagicMock()
+        mock_mdl.parameters.return_value = [torch.randn(100, 100)]
+        mock_mdl.to.return_value = mock_mdl
+        mock_model.from_pretrained.return_value = mock_mdl
+
+        # Mock tokenizer
+        mock_tok = MagicMock()
+        mock_tokenizer.from_pretrained.return_value = mock_tok
+
+        model, config, tokenizer = loader.load_model('test/model', device='cpu')
+
+        assert model == mock_mdl
+        assert config == mock_cfg
+        assert tokenizer == mock_tok
+
+        # Verify mocks were called with correct arguments
+        mock_config.from_pretrained.assert_called_once()
+        call_kwargs = mock_config.from_pretrained.call_args
+        assert call_kwargs[0][0] == 'test/model'
+        assert call_kwargs[1]['trust_remote_code'] is True
+        assert call_kwargs[1]['cache_dir'] == loader.cache_dir
+
+        mock_model.from_pretrained.assert_called_once()
+        model_call_kwargs = mock_model.from_pretrained.call_args
+        assert model_call_kwargs[1]['trust_remote_code'] is True
+        assert model_call_kwargs[1]['cache_dir'] == loader.cache_dir
+
+        mock_tokenizer.from_pretrained.assert_called_once()
+
+        # Verify model was moved to the requested device
+        mock_mdl.to.assert_called_once_with('cpu')
+
+    @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoTokenizer')
+    @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoModel')
+    @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoConfig')
+    def test_load_model_not_found(self, mock_config, mock_model, mock_tokenizer, loader):
+        """Test loading non-existent model."""
+        mock_config.from_pretrained.side_effect = OSError('404 Client Error')
+
+        with pytest.raises(ModelNotFoundError, match='not found'):
+            loader.load_model('nonexistent/model')
+
+    def test_load_model_from_config_invalid_source(self, loader):
+        """Test loading with invalid source in config."""
+        config = ModelSourceConfig(source='in-house', identifier='bert-base')
+
+        with pytest.raises(ValueError, match='Cannot load model'):
+            loader.load_model_from_config(config)
+
+    def test_get_model_size(self, loader):
+        """Test model size calculation."""
+        mock_model = MagicMock()
+        # Two tensors of 1M and 0.25M elements, i.e. 1.25M parameters in total
+        mock_model.parameters.return_value = [torch.randn(1000, 1000), torch.randn(500, 500)]
+
+        size = loader._get_model_size(mock_model)
+        assert abs(size - 1.25) < 0.01    # Should be ~1.25M
diff --git a/tests/benchmarks/micro_benchmarks/test_model_source_config.py b/tests/benchmarks/micro_benchmarks/test_model_source_config.py
new file mode 100644
index 000000000..9d9f7f35e
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_model_source_config.py
@@ -0,0 +1,73 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Unit tests for ModelSourceConfig."""
+
+import pytest
+from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig
+
+
+class TestModelSourceConfig:
+    """Unit tests covering construction and validation of ModelSourceConfig."""
+    def test_default_config(self):
+        """Only an identifier is supplied, so the remaining fields take their defaults."""
+        cfg = ModelSourceConfig(identifier='bert-base')
+        assert cfg.identifier == 'bert-base'
+        assert cfg.source == 'in-house'
+        assert cfg.torch_dtype == 'float32'
+        assert cfg.hf_token is None
+
+    def test_huggingface_config(self):
+        """An explicit HuggingFace source, identifier and dtype are stored as given."""
+        cfg = ModelSourceConfig(torch_dtype='float16', identifier='meta-llama/Llama-2-7b-hf', source='huggingface')
+        assert cfg.torch_dtype == 'float16'
+        assert cfg.identifier == 'meta-llama/Llama-2-7b-hf'
+        assert cfg.source == 'huggingface'
+
+    def test_invalid_source(self):
+        """An unknown source is rejected when the config is constructed."""
+        with pytest.raises(ValueError, match='Invalid model source'):
+            ModelSourceConfig(identifier='test', source='invalid')
+
+    def test_invalid_dtype(self):
+        """An unknown torch_dtype is rejected when the config is constructed."""
+        with pytest.raises(ValueError, match='Invalid torch_dtype'):
+            ModelSourceConfig(torch_dtype='invalid', identifier='test')
+
+    def test_missing_identifier(self):
+        """An empty identifier is rejected when the config is constructed."""
+        with pytest.raises(ValueError, match='identifier must be provided'):
+            ModelSourceConfig(identifier='')
+
+    def test_validate_huggingface_empty(self):
+        """A whitespace-only identifier constructs, but validate() reports it as empty."""
+        blank_cfg = ModelSourceConfig(identifier='   ', source='huggingface')
+        ok, reason = blank_cfg.validate()
+        assert not ok
+        assert 'cannot be empty' in reason
+
+    def test_validate_valid_huggingface(self):
+        """A namespace/name identifier passes validate() with an empty message."""
+        cfg = ModelSourceConfig(identifier='meta-llama/Llama-2-7b-hf', source='huggingface')
+        ok, reason = cfg.validate()
+        assert ok
+        assert reason == ''
+
+    def test_validate_valid_huggingface_short_name(self):
+        """A bare repo name without an organisation also passes validate()."""
+        cfg = ModelSourceConfig(identifier='bert-base-uncased', source='huggingface')
+        ok, reason = cfg.validate()
+        assert ok
+        assert reason == ''
+
+    def test_is_huggingface(self):
+        """is_huggingface() is True only when the source is 'huggingface'."""
+        remote = ModelSourceConfig(identifier='test/model', source='huggingface')
+        local = ModelSourceConfig(identifier='bert-base', source='in-house')
+        assert remote.is_huggingface() is True
+        assert local.is_huggingface() is False
+
+    def test_deprecated_use_auth_token(self):
+        """A value passed as the deprecated use_auth_token shows up as hf_token."""
+        legacy = ModelSourceConfig(use_auth_token='old_token', identifier='test')
+        assert legacy.hf_token == 'old_token'
diff --git a/tests/helper/decorator.py b/tests/helper/decorator.py
index ff08469ac..8d0ad314b 100644
--- a/tests/helper/decorator.py
+++ b/tests/helper/decorator.py
@@ -13,6 +13,7 @@
 
 pytorch_test = unittest.skipIf(os.environ.get('SB_TEST_PYTORCH', '1') == '0', 'Skip PyTorch tests.')
 directx_test = unittest.skipIf(os.environ.get('SB_TEST_DIRECTX', '0') == '0', 'Skip DirectX tests.')
+hf_e2e_test = unittest.skipUnless(os.environ.get('SB_TEST_HF_E2E', '0') == '1', 'Skip HF E2E tests. Set SB_TEST_HF_E2E=1 to enable.')
 
 
 def load_data(filepath):

From 6139332094fc378fa6f598646b2cc5ea7df6b8ee Mon Sep 17 00:00:00 2001
From: root
 <root@GB30002.kvvnxsngzpxejneyr54yfhf04a.cbnx.internal.cloudapp.net>
Date: Fri, 29 May 2026 21:23:06 +0000
Subject: [PATCH 02/12] fixing PR comments

---
 .../huggingface_model_loader.py               | 104 ++++++++++++++---
 .../micro_benchmarks/model_source_config.py   |   6 +-
 .../ort_inference_performance.py              |  99 +++++++++++++---
 .../tensorrt_inference_performance.py         | 106 ++++++++++++++++--
 .../micro_benchmarks/test_huggingface_e2e.py  |  11 +-
 .../test_huggingface_loader.py                |   8 +-
 tests/helper/decorator.py                     |   1 -
 7 files changed, 287 insertions(+), 48 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
index 9d8c55359..c7f28a2eb 100644
--- a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
+++ b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
@@ -4,6 +4,7 @@
 """Hugging Face model loader for benchmarking."""
 
 import os
+import re
 from pathlib import Path
 from typing import Optional, Tuple
 
@@ -20,6 +21,39 @@
 from superbench.common.utils import logger
 from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig
 
+# Strict allow-list for HuggingFace model identifiers. Accepts either a bare
+# repo name ('bert-base-uncased') or 'namespace/name' form, restricted to the
+# character set HF itself uses and bounded in length. Rejects '..' anywhere, a
+# name starting with '.', backslash, colon, control chars (including a trailing
+# newline, hence '\Z' rather than '$') and absolute paths, so the value cannot
+# traverse out of the working directory when handed to from_pretrained.
+_SAFE_MODEL_ID_RE = re.compile(r'^(?!.*\.\.)[A-Za-z0-9][A-Za-z0-9._-]{0,127}(/(?!\.)[A-Za-z0-9._-]{1,128})?\Z')
+
+
+def validate_model_identifier(model_identifier: Optional[str]) -> str:
+    """Validate a HuggingFace model identifier against a strict allow-list.
+
+    Args:
+        model_identifier: The identifier to validate (typically from CLI input).
+
+    Returns:
+        The validated identifier (unchanged) for convenient inline use.
+
+    Raises:
+        ValueError: If the identifier is missing, is not a string, or does not
+            match the permitted ``[namespace/]name`` shape. The check rejects
+            '..', dot-leading names and trailing newlines so ``from_pretrained``
+            is never handed a path-traversal sequence.
+    """
+    if not isinstance(model_identifier, str) or not _SAFE_MODEL_ID_RE.fullmatch(model_identifier):
+        raise ValueError(
+            f'Invalid model_identifier {model_identifier!r}. '
+            'Must be a HuggingFace repo id matching '
+            f"'{_SAFE_MODEL_ID_RE.pattern}' "
+            '(e.g. "bert-base-uncased" or "meta-llama/Llama-2-7b-hf").'
+        )
+    return model_identifier
+
 
 class ModelLoadError(Exception):
     """Exception raised when model loading fails."""
@@ -46,16 +80,29 @@ class HuggingFaceModelLoader:
     Attributes:
         cache_dir: Directory to cache downloaded models.
         token: HuggingFace authentication token for private/gated models.
+        allow_remote_code: Whether to allow HuggingFace to download and execute
+            repository-provided Python (``trust_remote_code=True``). Default
+            ``False``; enabling this turns ``--model_identifier`` into an RCE
+            sink, so it is opt-in only.
     """
-    def __init__(self, cache_dir: Optional[str] = None, token: Optional[str] = None):
+    def __init__(
+        self,
+        cache_dir: Optional[str] = None,
+        token: Optional[str] = None,
+        allow_remote_code: bool = False,
+    ):
         """Initialize the HuggingFace model loader.
 
         Args:
             cache_dir: Directory to cache downloaded models. If None, uses HF default.
             token: HuggingFace authentication token for private/gated models.
+            allow_remote_code: If True, allow execution of model-repo Python via
+                ``trust_remote_code=True``. Default False. Only enable for
+                trusted ``--model_identifier`` values; pin ``--revision <sha>``.
         """
         self.cache_dir = cache_dir or os.getenv('HF_HOME') or os.path.expanduser('~/.cache/huggingface')
         self.token = token or os.getenv('HF_TOKEN') or os.getenv('HUGGING_FACE_HUB_TOKEN')
+        self.allow_remote_code = bool(allow_remote_code)
 
         # Ensure cache directory exists
         Path(self.cache_dir).mkdir(parents=True, exist_ok=True)
@@ -63,6 +110,11 @@ def __init__(self, cache_dir: Optional[str] = None, token: Optional[str] = None)
         logger.info(f'HuggingFaceModelLoader initialized with cache_dir: {self.cache_dir}')
         if self.token:
             logger.info('Authentication token provided for private/gated models (token not logged)')
+        if self.allow_remote_code:
+            logger.warning(
+                'allow_remote_code=True: HuggingFace may download and execute arbitrary Python '
+                'from model repositories. Only enable for trusted model identifiers; pin --revision.'
+            )
 
     def load_model(
         self,
@@ -94,6 +146,9 @@ def load_model(
         """
         logger.info(f'Loading model: {model_identifier}')
 
+        # Reject malformed / path-like identifiers before any network or disk activity.
+        validate_model_identifier(model_identifier)
+
         try:
             # Convert torch_dtype string to torch dtype
             dtype = self._get_torch_dtype(torch_dtype) if torch_dtype else None
@@ -112,7 +167,9 @@ def load_model(
             # Load config (use pre-downloaded config if provided)
             if config is None:
                 logger.info('Loading model configuration...')
-                config = AutoConfig.from_pretrained(model_identifier, trust_remote_code=True, **load_kwargs)
+                config = AutoConfig.from_pretrained(
+                    model_identifier, trust_remote_code=self.allow_remote_code, **load_kwargs
+                )
             else:
                 logger.info('Using pre-downloaded model configuration.')
 
@@ -120,14 +177,16 @@ def load_model(
             tokenizer = None
             try:
                 logger.info('Loading tokenizer...')
-                tokenizer = AutoTokenizer.from_pretrained(model_identifier, trust_remote_code=True, **load_kwargs)
+                tokenizer = AutoTokenizer.from_pretrained(
+                    model_identifier, trust_remote_code=self.allow_remote_code, **load_kwargs
+                )
             except Exception as e:
                 logger.warning(f'Could not load tokenizer: {e}. Continuing without tokenizer.')
 
             # Load model
             logger.info(f'Loading model weights (dtype={torch_dtype}, device={device})...')
             model_kwargs = load_kwargs.copy()
-            model_kwargs['trust_remote_code'] = True
+            model_kwargs['trust_remote_code'] = self.allow_remote_code
 
             # Handle device mapping for large models
             effective_device_map = device_map
@@ -202,16 +261,31 @@ def load_model_from_config(
         if device is None:
             device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
-        # Extract loading parameters
-        return self.load_model(
-            model_identifier=config.identifier,
-            torch_dtype=config.torch_dtype,
-            device=device,
-            revision=config.revision,
-            device_map=config.device_map,
-            config=config_pretrained,
-            **config.additional_kwargs
-        )
+        # Honor explicit per-call hf_token / cache_dir from the config without permanently
+        # mutating the loader instance. This makes ModelSourceConfig the single source of
+        # truth for callers that don't rely on HF_TOKEN / HF_HOME env vars.
+        original_token = self.token
+        original_cache_dir = self.cache_dir
+        try:
+            if config.hf_token:
+                self.token = config.hf_token
+            if config.cache_dir:
+                self.cache_dir = config.cache_dir
+                Path(self.cache_dir).mkdir(parents=True, exist_ok=True)
+
+            # Extract loading parameters
+            return self.load_model(
+                model_identifier=config.identifier,
+                torch_dtype=config.torch_dtype,
+                device=device,
+                revision=config.revision,
+                device_map=config.device_map,
+                config=config_pretrained,
+                **config.additional_kwargs
+            )
+        finally:
+            self.token = original_token
+            self.cache_dir = original_cache_dir
 
     def _get_torch_dtype(self, dtype_str: str) -> torch.dtype:
         """Convert dtype string to torch.dtype.
@@ -242,7 +316,7 @@ def _get_torch_dtype(self, dtype_str: str) -> torch.dtype:
         }
 
         if normalized_dtype not in dtype_map:
-            raise ValueError(f"Invalid dtype '{dtype_str}'.Must be one of {list(dtype_map.keys())}")
+            raise ValueError(f"Invalid dtype '{dtype_str}'. Must be one of {list(dtype_map.keys())}")
 
         return dtype_map[normalized_dtype]
 
diff --git a/superbench/benchmarks/micro_benchmarks/model_source_config.py b/superbench/benchmarks/micro_benchmarks/model_source_config.py
index 99ca31870..b141e6a21 100644
--- a/superbench/benchmarks/micro_benchmarks/model_source_config.py
+++ b/superbench/benchmarks/micro_benchmarks/model_source_config.py
@@ -47,8 +47,10 @@ def __post_init__(self):
         if self.source not in ['in-house', 'huggingface']:
             raise ValueError(f"Invalid model source '{self.source}'. Must be 'in-house' or 'huggingface'.")
 
-        # Validate torch_dtype
-        valid_dtypes = ['float32', 'float16', 'bfloat16', 'int8']
+        # Validate torch_dtype. NOTE: 'int8' is intentionally excluded here — it is handled
+        # post-export via quantize_dynamic (see ort_inference_performance.py) rather than via
+        # the HF torch_dtype loading path, which does not accept torch.int8.
+        valid_dtypes = ['float32', 'float16', 'bfloat16']
         if self.torch_dtype not in valid_dtypes:
             raise ValueError(f"Invalid torch_dtype '{self.torch_dtype}'. Must be one of {valid_dtypes}.")
 
diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
index 2e0fff826..7e74e62c1 100644
--- a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
@@ -12,10 +12,13 @@
 import numpy as np
 
 from superbench.common.utils import logger
-from superbench.benchmarks import BenchmarkRegistry, Platform, Precision
+from superbench.benchmarks import BenchmarkRegistry, Platform, Precision, ReturnCode
 from superbench.benchmarks.micro_benchmarks import MicroBenchmark
 from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig
-from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader
+from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import (
+    HuggingFaceModelLoader,
+    validate_model_identifier,
+)
 
 
 class ORTInferenceBenchmark(MicroBenchmark):
@@ -42,6 +45,9 @@ def __init__(self, name, parameters=''):
         ]
         self.__graph_opt_level = None
         self.__model_cache_path = Path(torch.hub.get_dir()) / 'checkpoints'
+        # Stashed HF config (populated in _preprocess_huggingface_models) so that
+        # __inference() can derive vocab_size / dynamic input shapes from it.
+        self._hf_config = None
 
     def add_parser_arguments(self):
         """Add the specified arguments."""
@@ -124,6 +130,24 @@ def add_parser_arguments(self):
             help='Sequence length for transformer models.',
         )
 
+        self._parser.add_argument(
+            '--require_cuda',
+            action='store_true',
+            default=False,
+            required=False,
+            help='Fail if CUDAExecutionProvider is not available. '
+            'Default: warn and fall back to other registered ORT providers (CPU/ROCm/etc.).',
+        )
+
+        self._parser.add_argument(
+            '--allow_remote_code',
+            action='store_true',
+            default=False,
+            required=False,
+            help='Allow HuggingFace to execute model-repo Python (trust_remote_code=True). '
+            'SECURITY: enables RCE from --model_identifier. Pin --revision <sha> when used.',
+        )
+
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
 
@@ -179,8 +203,19 @@ def _preprocess_huggingface_models(self):
 
         if not self._args.model_identifier:
             logger.error('--model_identifier is required when using --model_source huggingface')
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
             return False
 
+        # Reject malformed / path-like identifiers up front, before any network or disk activity.
+        try:
+            validate_model_identifier(self._args.model_identifier)
+        except ValueError as e:
+            logger.error(str(e))
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
+            return False
+
+        allow_remote_code = bool(getattr(self._args, 'allow_remote_code', False))
+
         try:
             logger.info(f'Loading HuggingFace model: {self._args.model_identifier}')
 
@@ -190,13 +225,18 @@ def _preprocess_huggingface_models(self):
             load_kwargs = {}
             if hf_token:
                 load_kwargs['token'] = hf_token
-            hf_config = AutoConfig.from_pretrained(self._args.model_identifier, trust_remote_code=True, **load_kwargs)
+            hf_config = AutoConfig.from_pretrained(
+                self._args.model_identifier, trust_remote_code=allow_remote_code, **load_kwargs
+            )
+            # Stash for __inference() to read vocab_size / other model metadata later.
+            self._hf_config = hf_config
 
             precision_str = self._args.precision.value if self._args.precision != Precision.INT8 else 'float32'
             fits, param_m, est_gb, avail_gb = HuggingFaceModelLoader.check_memory_fits(
                 self._args.model_identifier, hf_config, precision_str, mode='inference', token=hf_token
             )
             if not fits:
+                self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
                 return False
 
             # Step 2: Proceed with model download and ONNX export
@@ -217,7 +257,7 @@ def _preprocess_huggingface_models(self):
             )
 
             # Load model from HuggingFace on CPU
-            loader = HuggingFaceModelLoader()
+            loader = HuggingFaceModelLoader(allow_remote_code=allow_remote_code)
             hf_model, _, _ = loader.load_model_from_config(model_config, device='cpu')
             from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter
             exporter = torch2onnxExporter()
@@ -237,6 +277,15 @@ def _preprocess_huggingface_models(self):
                 export_precision = self._args.precision.value
             model_name_with_precision = f'{model_name}.{export_precision}'
 
+            # Defense-in-depth: confirm the resolved output path stays inside the rank
+            # directory even though validate_model_identifier already rejected '..' / '\\'.
+            proc_root = proc_output_path.resolve()
+            resolved_out = (proc_output_path / f'{model_name_with_precision}.onnx').resolve()
+            if proc_root not in resolved_out.parents:
+                logger.error(f'Refusing to write ONNX outside rank dir: {resolved_out} not under {proc_root}')
+                self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
+                return False
+
             # Export directly to final destination to avoid path issues with external data
             onnx_path = exporter.export_huggingface_model(
                 model=hf_model,
@@ -248,6 +297,7 @@ def _preprocess_huggingface_models(self):
 
             if not onnx_path:
                 logger.error(f'Failed to export {self._args.model_identifier} to ONNX')
+                self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
                 return False
 
             # Apply INT8 quantization if requested (matching in-house model behavior)
@@ -268,6 +318,7 @@ def _preprocess_huggingface_models(self):
             logger.error(f'Failed to prepare HuggingFace model: {str(e)}')
             import traceback
             logger.error(traceback.format_exc())
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
             return False
 
     def _benchmark(self):
@@ -275,18 +326,24 @@ def _benchmark(self):
         import onnxruntime as ort
         precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'int8': 'int8'}
 
-        # Require CUDAExecutionProvider — this benchmark targets GPU inference
         available = ort.get_available_providers()
-        if 'CUDAExecutionProvider' not in available:
-            logger.error(f'CUDAExecutionProvider is not available (available: {available}).')
-            return False
+        cuda_available = 'CUDAExecutionProvider' in available
+        if not cuda_available:
+            msg = f'CUDAExecutionProvider is not available (available providers: {available}).'
+            if getattr(self._args, 'require_cuda', False):
+                logger.error(msg + ' --require_cuda was set, aborting.')
+                return False
+            logger.warning(
+                msg + ' Falling back to registered providers; pass --require_cuda to fail instead.'
+            )
+        providers = ['CUDAExecutionProvider'] if cuda_available else available
 
         for model in self._args.pytorch_models:
             sess_options = ort.SessionOptions()
             sess_options.graph_optimization_level = self.__graph_opt_level[self._args.graph_opt_level]
             file_name = '{model}.{precision}.onnx'.format(model=model, precision=self._args.precision)
             ort_sess = ort.InferenceSession(
-                f'{self.__model_cache_path / file_name}', sess_options, providers=['CUDAExecutionProvider']
+                f'{self.__model_cache_path / file_name}', sess_options, providers=providers
             )
 
             elapse_times = self.__inference(ort_sess)
@@ -318,18 +375,30 @@ def __inference(self, ort_sess):
         """
         precision = np.float16 if self._args.precision == Precision.FLOAT16 else np.float32
 
-        # Get input names from the ONNX session to determine input format
-        input_names = [input.name for input in ort_sess.get_inputs()]
+        # Get input metadata from the ONNX session to determine input format and shapes
+        ort_inputs = ort_sess.get_inputs()
+        input_names = [inp.name for inp in ort_inputs]
 
         # Determine input format based on what the model expects
         if 'pixel_values' in input_names:
-            # Vision model: use pixel_values (batch_size, 3, 224, 224)
-            pixel_values = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision)
+            # Vision model: derive (C, H, W) from the exported ONNX graph so that models
+            # with non-default shapes (e.g. 384x384 ViT, 1-channel medical models) work.
+            # Each dynamic / unknown axis falls back on its own: C -> 3, H -> 224, W -> 224.
+            meta = next(inp for inp in ort_inputs if inp.name == 'pixel_values')
+            dims = [d if isinstance(d, int) else None for d in (meta.shape or [])]
+            # Expected layout is (N, C, H, W); pad to length 4 if shorter.
+            dims = (dims + [None] * 4)[:4]
+            _, c, h, w = dims
+            c, h, w = c or 3, h or 224, w or 224
+            pixel_values = np.random.randn(self._args.batch_size, c, h, w).astype(dtype=precision)
             inputs = {'pixel_values': pixel_values}
         elif 'input_ids' in input_names:
-            # NLP model: use input_ids and attention_mask
+            # NLP model: use input_ids and attention_mask. Cap token IDs at the HF config's
+            # vocab_size (30000 when it is missing or falsy) to avoid out-of-range embedding
+            # lookups (undefined behavior on CUDA — silent NaNs / device-side asserts).
             seq_len = getattr(self._args, 'seq_length', 512)
-            input_ids = np.random.randint(0, 30000, (self._args.batch_size, seq_len)).astype(np.int64)
+            vocab_size = getattr(self._hf_config, 'vocab_size', None) or 30000
+            input_ids = np.random.randint(0, vocab_size, (self._args.batch_size, seq_len)).astype(np.int64)
             attention_mask = np.ones((self._args.batch_size, seq_len), dtype=np.int64)
             inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
         else:
diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
index 5153073a3..737358aaa 100644
--- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
@@ -5,6 +5,7 @@
 
 import os
 import re
+import subprocess
 from pathlib import Path
 
 import torch
@@ -14,7 +15,10 @@
 from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
 from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter
 from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig
-from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader
+from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import (
+    HuggingFaceModelLoader,
+    validate_model_identifier,
+)
 
 
 class TensorRTInferenceBenchmark(MicroBenchmarkWithInvoke):
@@ -94,6 +98,47 @@ def add_parser_arguments(self):
             help='Model identifier for HuggingFace models (e.g., bert-base-uncased).',
         )
 
+        self._parser.add_argument(
+            '--allow_remote_code',
+            action='store_true',
+            default=False,
+            required=False,
+            help='Allow HuggingFace to execute model-repo Python (trust_remote_code=True). '
+            'SECURITY: enables RCE from --model_identifier. Pin --revision <sha> when used.',
+        )
+
+    @staticmethod
+    def __detect_workspace_flag(bin_path: str) -> str:
+        """Return the trtexec workspace flag supported by the installed binary.
+
+        Args:
+            bin_path: Path to the trtexec binary; it is probed by running ``<bin_path> --help``.
+
+        Returns:
+            ``'--memPoolSize=workspace:8192M'`` if the help output mentions ``--memPoolSize``
+            (TensorRT >= 8.4); ``'--workspace=8192'`` otherwise or when probing fails.
+        """
+        modern = '--memPoolSize=workspace:8192M'
+        legacy = '--workspace=8192'
+        try:
+            proc = subprocess.run(
+                [bin_path, '--help'], capture_output=True, text=True, timeout=10, check=False
+            )
+            help_text = (proc.stdout or '') + (proc.stderr or '')
+            if '--memPoolSize' in help_text:
+                return modern
+            logger.warning(
+                'trtexec at %s does not advertise --memPoolSize; falling back to --workspace=8192 '
+                '(TensorRT < 8.4 detected).', bin_path
+            )
+            return legacy
+        except (OSError, subprocess.SubprocessError) as e:
+            logger.warning(
+                'Could not probe trtexec at %s for --memPoolSize support (%s); using --workspace=8192.',
+                bin_path, e,
+            )
+            return legacy
+
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
 
@@ -104,6 +149,11 @@ def _preprocess(self):
             return False
 
         self.__bin_path = str(Path(self._args.bin_dir) / self._bin_name)
+        # Pick the right workspace flag for the installed trtexec. --memPoolSize was
+        # introduced in TensorRT 8.4; older runtimes (TRT 8.0-8.3, still found in
+        # some CUDA 11.x base images) only accept the deprecated-but-still-supported
+        # --workspace=. Probe once here and reuse for every model.
+        self.__workspace_flag = self.__detect_workspace_flag(self.__bin_path)
 
         # Handle HuggingFace models if specified
         if self._args.model_source == 'huggingface':
@@ -131,7 +181,7 @@ def _preprocess(self):
                 f'--onnx={onnx_model}',
                 # build options
                 f'--optShapes=input:{input_shape}',
-                '--memPoolSize=workspace:8192M',
+                self.__workspace_flag,
                 None if self._args.precision == 'fp32' else f'--{self._args.precision}',
                 # inference options
                 f'--iterations={self._args.iterations}',
@@ -148,11 +198,34 @@ def _preprocess_huggingface_models(self):
         Returns:
             bool: True if preprocessing succeeds.
         """
-        import os
         from transformers import AutoConfig
 
         if not self._args.model_identifier:
             logger.error('--model_identifier is required when using --model_source huggingface')
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
+            return False
+
+        # Reject malformed / path-like identifiers up front, before any network or disk activity.
+        try:
+            validate_model_identifier(self._args.model_identifier)
+        except ValueError as e:
+            logger.error(str(e))
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
+            return False
+
+        allow_remote_code = bool(getattr(self._args, 'allow_remote_code', False))
+
+        # Reject INT8 on the HuggingFace path: the current pipeline emits `--int8` to
+        # trtexec without `--calib=<file>` and without a Q/DQ-embedded ONNX, so trtexec
+        # would fall back to fake dynamic ranges and report misleading latencies.
+        if str(getattr(self._args, 'precision', '')).lower() == 'int8':
+            logger.error(
+                'TensorRT --precision int8 on HuggingFace models is not supported: '
+                'no calibration data / Q-DQ ONNX is generated, so reported latencies '
+                'would not represent a correctly-calibrated INT8 engine. '
+                'Use --precision fp16 or fp32, or run ORT INT8 quantization first.'
+            )
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
             return False
 
         try:
@@ -163,12 +236,15 @@ def _preprocess_huggingface_models(self):
             if hf_token:
                 load_kwargs['token'] = hf_token
 
-            hf_config = AutoConfig.from_pretrained(self._args.model_identifier, trust_remote_code=True, **load_kwargs)
+            hf_config = AutoConfig.from_pretrained(
+                self._args.model_identifier, trust_remote_code=allow_remote_code, **load_kwargs
+            )
             precision_str = self._args.precision    # already a string: 'fp16', 'fp32', 'int8'
             fits, param_m, est_gb, avail_gb = HuggingFaceModelLoader.check_memory_fits(
                 self._args.model_identifier, hf_config, precision_str, mode='inference', token=hf_token
             )
             if not fits:
+                self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
                 return False
 
             # Step 2: Download and load the full model
@@ -193,7 +269,7 @@ def _preprocess_huggingface_models(self):
             logger.info(f'Loading HuggingFace model: {self._args.model_identifier}')
 
             # Load model from HuggingFace on CPU
-            loader = HuggingFaceModelLoader()
+            loader = HuggingFaceModelLoader(allow_remote_code=allow_remote_code)
             hf_model, hf_config, _ = loader.load_model_from_config(model_config, device='cpu')
             self._hf_config = hf_config
             exporter = torch2onnxExporter()
@@ -205,6 +281,15 @@ def _preprocess_huggingface_models(self):
             output_dir = str(Path(torch.hub.get_dir()) / 'checkpoints' / f'trt_rank_{proc_rank}')
             os.makedirs(output_dir, exist_ok=True)
 
+            # Defense-in-depth: confirm resolved output path stays inside the rank directory
+            # even though validate_model_identifier already rejected '..' / '\\' / control chars.
+            proc_root = Path(output_dir).resolve()
+            resolved_out = (Path(output_dir) / f'{model_name}.onnx').resolve()
+            if proc_root not in resolved_out.parents:
+                logger.error(f'Refusing to write ONNX outside rank dir: {resolved_out} not under {proc_root}')
+                self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
+                return False
+
             onnx_path = exporter.export_huggingface_model(
                 model=hf_model,
                 model_name=model_name,
@@ -215,11 +300,15 @@ def _preprocess_huggingface_models(self):
 
             if not onnx_path:
                 logger.error(f'Failed to export {self._args.model_identifier} to ONNX')
+                self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
                 return False
 
-            # Determine input shape based on model type by checking ONNX file
+            # Determine input shape based on model type by checking ONNX file.
+            # Pass load_external_data=False because we only need graph input metadata;
+            # the default True would materialize all sidecar tensors and OOM on the
+            # >2GB external-data models that this branch was written for.
             import onnx as onnx_lib
-            onnx_model = onnx_lib.load(onnx_path)
+            onnx_model = onnx_lib.load(onnx_path, load_external_data=False)
 
             # Filter out initializers from graph.input to get only runtime inputs
             initializer_names = {init.name for init in onnx_model.graph.initializer}
@@ -277,7 +366,7 @@ def _preprocess_huggingface_models(self):
                 self.__bin_path,
                 f'--onnx={onnx_path}',
                 f'--optShapes={input_shapes}',
-                '--memPoolSize=workspace:8192M',
+                self.__workspace_flag,
                 None if self._args.precision == 'fp32' else f'--{self._args.precision}',
                 f'--iterations={self._args.iterations}',
                 '--percentile=99',
@@ -294,6 +383,7 @@ def _preprocess_huggingface_models(self):
             logger.error(f'Failed to prepare HuggingFace model: {str(e)}')
             import traceback
             logger.error(traceback.format_exc())
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
             return False
 
     def _process_raw_result(self, cmd_idx, raw_output):
diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py
index 55c378500..22061bf7c 100644
--- a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py
+++ b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py
@@ -20,13 +20,16 @@
 from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig
 
 
-@pytest.mark.skipif(os.environ.get('SB_TEST_HF_E2E', '0') != '1', reason='Skip HF E2E tests. Set SB_TEST_HF_E2E=1 to enable.')
+@pytest.mark.skipif(
+    os.environ.get('SB_TEST_HF_E2E', '0') != '1',
+    reason='Skip HF E2E tests. Set SB_TEST_HF_E2E=1 to enable.',
+)
 class TestHuggingFaceE2E:
     """End-to-end tests for HuggingFace model loading."""
     @pytest.fixture
-    def loader(self):
-        """Create a loader instance."""
-        return HuggingFaceModelLoader(cache_dir='/tmp/hf_test_cache')
+    def loader(self, tmp_path):
+        """Create a loader instance with an isolated per-test cache dir."""
+        return HuggingFaceModelLoader(cache_dir=str(tmp_path / 'hf_cache'))
 
     def test_load_tiny_bert_model(self, loader):
         """Test loading a tiny BERT model from HuggingFace Hub.
diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py b/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py
index e679fb068..0d32c9ea4 100644
--- a/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py
+++ b/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py
@@ -71,16 +71,18 @@ def test_load_model_success(self, mock_tokenizer, mock_config, mock_model, loade
         assert config == mock_cfg
         assert tokenizer == mock_tok
 
-        # Verify mocks were called with correct arguments
+        # Verify mocks were called with correct arguments. trust_remote_code must
+        # default to False (matches loader.allow_remote_code=False) so that arbitrary
+        # repo Python is not executed unless the caller explicitly opts in.
         mock_config.from_pretrained.assert_called_once()
         call_kwargs = mock_config.from_pretrained.call_args
         assert call_kwargs[0][0] == 'test/model'
-        assert call_kwargs[1]['trust_remote_code'] is True
+        assert call_kwargs[1]['trust_remote_code'] is False
         assert call_kwargs[1]['cache_dir'] == loader.cache_dir
 
         mock_model.from_pretrained.assert_called_once()
         model_call_kwargs = mock_model.from_pretrained.call_args
-        assert model_call_kwargs[1]['trust_remote_code'] is True
+        assert model_call_kwargs[1]['trust_remote_code'] is False
         assert model_call_kwargs[1]['cache_dir'] == loader.cache_dir
 
         mock_tokenizer.from_pretrained.assert_called_once()
diff --git a/tests/helper/decorator.py b/tests/helper/decorator.py
index 8d0ad314b..ff08469ac 100644
--- a/tests/helper/decorator.py
+++ b/tests/helper/decorator.py
@@ -13,7 +13,6 @@
 
 pytorch_test = unittest.skipIf(os.environ.get('SB_TEST_PYTORCH', '1') == '0', 'Skip PyTorch tests.')
 directx_test = unittest.skipIf(os.environ.get('SB_TEST_DIRECTX', '0') == '0', 'Skip DirectX tests.')
-hf_e2e_test = unittest.skipUnless(os.environ.get('SB_TEST_HF_E2E', '0') == '1', 'Skip HF E2E tests. Set SB_TEST_HF_E2E=1 to enable.')
 
 
 def load_data(filepath):

From 18f13ef4b9b8f05644b5ee79ebe8b80b325766bf Mon Sep 17 00:00:00 2001
From: Aishwarya Tonpe <aishwarya.tonpe25@gmail.com>
Date: Tue, 2 Jun 2026 10:29:13 -0700
Subject: [PATCH 03/12] Raise ValueError when exported ONNX has no runtime inputs

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 .../micro_benchmarks/tensorrt_inference_performance.py          | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
index b00791aec..717d7aa0d 100644
--- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
@@ -358,6 +358,8 @@ def _derive_trt_input_shapes(self, onnx_path):
         # Filter out initializers from graph.input to get only runtime inputs
         initializer_names = {init.name for init in onnx_model.graph.initializer}
         runtime_inputs = [inp for inp in onnx_model.graph.input if inp.name not in initializer_names]
+        if not runtime_inputs:
+            raise ValueError(f'No runtime inputs found in exported ONNX model: {onnx_path}')
 
         # Get the first runtime input to determine shape and name
         input_name = runtime_inputs[0].name

From 83a533ff6a73b6fbfcc4573a4e2e7512ae1acc80 Mon Sep 17 00:00:00 2001
From: Aishwarya Tonpe <aishwarya.tonpe25@gmail.com>
Date: Tue, 2 Jun 2026 10:29:51 -0700
Subject: [PATCH 04/12] Discard unused check_memory_fits return values in TensorRT benchmark

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 .../micro_benchmarks/tensorrt_inference_performance.py          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
index 717d7aa0d..06d1dd6e0 100644
--- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
@@ -239,7 +239,7 @@ def _preprocess_huggingface_models(self):
                 self._args.model_identifier, trust_remote_code=allow_remote_code, **load_kwargs
             )
             precision_str = self._args.precision    # already a string: 'fp16', 'fp32', 'int8'
-            fits, param_m, est_gb, avail_gb = HuggingFaceModelLoader.check_memory_fits(
+            fits, _, _, _ = HuggingFaceModelLoader.check_memory_fits(
                 self._args.model_identifier, hf_config, precision_str, mode='inference', token=hf_token
             )
             if not fits:

From 44da2e13402c521eaacb88a550a23a3967c666ec Mon Sep 17 00:00:00 2001
From: Aishwarya Tonpe <aishwarya.tonpe25@gmail.com>
Date: Tue, 2 Jun 2026 10:30:31 -0700
Subject: [PATCH 05/12] Set failure return code when --require_cuda aborts ORT benchmark

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 .../benchmarks/micro_benchmarks/ort_inference_performance.py     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
index 8cbf269df..644d964f8 100644
--- a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
@@ -345,6 +345,7 @@ def _benchmark(self):
             msg = f'CUDAExecutionProvider is not available (available providers: {available}).'
             if getattr(self._args, 'require_cuda', False):
                 logger.error(msg + ' --require_cuda was set, aborting.')
+                self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
                 return False
             logger.warning(msg + ' Falling back to registered providers; pass --require_cuda to fail instead.')
         providers = ['CUDAExecutionProvider'] if cuda_available else available

From 864a8e9cdf8e18bccdff0cd7143f4f09a4edb6ce Mon Sep 17 00:00:00 2001
From: Aishwarya Tonpe <aishwarya.tonpe25@gmail.com>
Date: Tue, 2 Jun 2026 10:30:59 -0700
Subject: [PATCH 06/12] Match full 'Model identifier must be provided' message in test

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 tests/benchmarks/micro_benchmarks/test_model_source_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/benchmarks/micro_benchmarks/test_model_source_config.py b/tests/benchmarks/micro_benchmarks/test_model_source_config.py
index 9d9f7f35e..81cb7d0fa 100644
--- a/tests/benchmarks/micro_benchmarks/test_model_source_config.py
+++ b/tests/benchmarks/micro_benchmarks/test_model_source_config.py
@@ -36,7 +36,7 @@ def test_invalid_dtype(self):
 
     def test_missing_identifier(self):
         """Test missing identifier raises error."""
-        with pytest.raises(ValueError, match='identifier must be provided'):
+        with pytest.raises(ValueError, match='Model identifier must be provided'):
             ModelSourceConfig(identifier='')
 
     def test_validate_huggingface_empty(self):

From 54e4153a6d44e011fd76cdff02b11f5e7e47dac8 Mon Sep 17 00:00:00 2001
From: Aishwarya Tonpe <aishwarya.tonpe25@gmail.com>
Date: Tue, 2 Jun 2026 10:31:14 -0700
Subject: [PATCH 07/12] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 .../micro_benchmarks/_export_torch_to_onnx.py | 191 ++++++++++--------
 .../huggingface_model_loader.py               |  89 ++++----
 .../ort_inference_performance.py              |   2 +-
 .../micro_benchmarks/test_huggingface_e2e.py  |   7 +-
 4 files changed, 159 insertions(+), 130 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
index ab94f74e7..d395d18e0 100644
--- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
+++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
@@ -18,6 +18,7 @@
 
 class torch2onnxExporter():
     """PyTorch model to ONNX exporter."""
+
     def __init__(self):
         """Constructor."""
         from transformers import BertConfig, GPT2Config, LlamaConfig
@@ -314,95 +315,15 @@ def export_huggingface_model(self, model, model_name, batch_size=1, seq_length=5
             is_vision_model = main_input == 'pixel_values'
 
             if is_vision_model:
-                # Vision models: use pixel_values (batch_size, channels, height, width)
-                # Derive C/H/W from model config rather than hard-coding 3x224x224
-                num_channels = getattr(model.config, 'num_channels', 3)
-                image_size = getattr(model.config, 'image_size', 224)
-                if isinstance(image_size, (list, tuple)):
-                    img_h, img_w = image_size[0], image_size[1]
-                else:
-                    img_h, img_w = image_size, image_size
-
-                dummy_input = torch.randn(batch_size, num_channels, img_h, img_w, dtype=model_dtype, device=device)
-                input_names = ['pixel_values']
-                dynamic_axes = {'pixel_values': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
-
-                # Wrapper for vision models
-                class VisionModelWrapper(torch.nn.Module):
-                    def __init__(self, model):
-                        super().__init__()
-                        self.model = model
-
-                    def forward(self, pixel_values):
-                        outputs = self.model(pixel_values=pixel_values)
-                        if hasattr(outputs, 'logits'):
-                            return outputs.logits
-                        elif hasattr(outputs, 'last_hidden_state'):
-                            return outputs.last_hidden_state
-                        else:
-                            return outputs[0] if isinstance(outputs, (tuple, list)) else outputs
-
-                wrapped_model = VisionModelWrapper(model)
-                export_args = (dummy_input, )
+                wrapped_model, export_args, input_names, dynamic_axes = self._build_vision_export_inputs(
+                    model, batch_size, model_dtype, device
+                )
             else:
-                # NLP models: use input_ids and attention_mask
-                dummy_input = torch.ones((batch_size, seq_length), dtype=torch.int64, device=device)
-                attention_mask = torch.ones((batch_size, seq_length), dtype=torch.int64, device=device)
-                input_names = ['input_ids', 'attention_mask']
-                dynamic_axes = {
-                    'input_ids': {
-                        0: 'batch_size',
-                        1: 'seq_length'
-                    },
-                    'attention_mask': {
-                        0: 'batch_size',
-                        1: 'seq_length'
-                    },
-                    'output': {
-                        0: 'batch_size',
-                        1: 'seq_length'
-                    },
-                }
+                wrapped_model, export_args, input_names, dynamic_axes = self._build_nlp_export_inputs(
+                    model, batch_size, seq_length, device
+                )
 
-                # Wrapper for NLP models
-                class NLPModelWrapper(torch.nn.Module):
-                    def __init__(self, model):
-                        super().__init__()
-                        self.model = model
-
-                    def forward(self, input_ids, attention_mask):
-                        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
-                        if hasattr(outputs, 'logits'):
-                            return outputs.logits
-                        elif hasattr(outputs, 'last_hidden_state'):
-                            return outputs.last_hidden_state
-                        else:
-                            return outputs[0] if isinstance(outputs, (tuple, list)) else outputs
-
-                wrapped_model = NLPModelWrapper(model)
-                export_args = (dummy_input, attention_mask)
-
-            # Export to ONNX for large models (>2GB), use external data format
-            model_size_gb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024**3)
-            use_external_data = model_size_gb > 2.0
-
-            if use_external_data:
-                logger.info(f'Model size is {model_size_gb:.2f}GB, using external data format for ONNX export')
-
-            export_kwargs = {
-                'opset_version': 14,
-                'do_constant_folding': True,
-                'input_names': input_names,
-                'output_names': ['output'],
-                'dynamic_axes': dynamic_axes,
-            }
-            if use_external_data:
-                # PyTorch 2.8+ renamed 'use_external_data_format' to 'external_data'
-                sig = inspect.signature(torch.onnx.export)
-                if 'external_data' in sig.parameters:
-                    export_kwargs['external_data'] = True
-                else:
-                    export_kwargs['use_external_data_format'] = True
+            export_kwargs = self._build_onnx_export_kwargs(model, input_names, dynamic_axes)
 
             torch.onnx.export(
                 wrapped_model,
@@ -412,7 +333,7 @@ def forward(self, input_ids, attention_mask):
             )
 
             # Clean up
-            del dummy_input
+            del export_args
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
 
@@ -422,3 +343,97 @@ def forward(self, input_ids, attention_mask):
             logger.error(f'Failed to export HuggingFace model to ONNX: {str(e)}')
             logger.error(traceback.format_exc())
             return ''
+
+    def _build_vision_export_inputs(self, model, batch_size, model_dtype, device):
+        """Build the dummy inputs and wrapper module for exporting a vision HuggingFace model."""
+        # Vision models: use pixel_values (batch_size, channels, height, width)
+        # Derive C/H/W from model config rather than hard-coding 3x224x224
+        num_channels = getattr(model.config, 'num_channels', 3)
+        image_size = getattr(model.config, 'image_size', 224)
+        if isinstance(image_size, (list, tuple)):
+            img_h, img_w = image_size[0], image_size[1]
+        else:
+            img_h, img_w = image_size, image_size
+
+        dummy_input = torch.randn(batch_size, num_channels, img_h, img_w, dtype=model_dtype, device=device)
+        input_names = ['pixel_values']
+        dynamic_axes = {'pixel_values': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
+
+        class VisionModelWrapper(torch.nn.Module):
+
+            def __init__(self, model):
+                super().__init__()
+                self.model = model
+
+            def forward(self, pixel_values):
+                outputs = self.model(pixel_values=pixel_values)
+                if hasattr(outputs, 'logits'):
+                    return outputs.logits
+                elif hasattr(outputs, 'last_hidden_state'):
+                    return outputs.last_hidden_state
+                else:
+                    return outputs[0] if isinstance(outputs, (tuple, list)) else outputs
+
+        return VisionModelWrapper(model), (dummy_input, ), input_names, dynamic_axes
+
+    def _build_nlp_export_inputs(self, model, batch_size, seq_length, device):
+        """Build the dummy inputs and wrapper module for exporting an NLP HuggingFace model."""
+        # NLP models: use input_ids and attention_mask
+        dummy_input = torch.ones((batch_size, seq_length), dtype=torch.int64, device=device)
+        attention_mask = torch.ones((batch_size, seq_length), dtype=torch.int64, device=device)
+        input_names = ['input_ids', 'attention_mask']
+        dynamic_axes = {
+            'input_ids': {
+                0: 'batch_size',
+                1: 'seq_length'
+            },
+            'attention_mask': {
+                0: 'batch_size',
+                1: 'seq_length'
+            },
+            'output': {
+                0: 'batch_size',
+                1: 'seq_length'
+            },
+        }
+
+        class NLPModelWrapper(torch.nn.Module):
+
+            def __init__(self, model):
+                super().__init__()
+                self.model = model
+
+            def forward(self, input_ids, attention_mask):
+                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
+                if hasattr(outputs, 'logits'):
+                    return outputs.logits
+                elif hasattr(outputs, 'last_hidden_state'):
+                    return outputs.last_hidden_state
+                else:
+                    return outputs[0] if isinstance(outputs, (tuple, list)) else outputs
+
+        return NLPModelWrapper(model), (dummy_input, attention_mask), input_names, dynamic_axes
+
+    def _build_onnx_export_kwargs(self, model, input_names, dynamic_axes):
+        """Assemble torch.onnx.export kwargs, enabling external-data format for >2GB models."""
+        model_size_gb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024**3)
+        use_external_data = model_size_gb > 2.0
+
+        if use_external_data:
+            logger.info(f'Model size is {model_size_gb:.2f}GB, using external data format for ONNX export')
+
+        export_kwargs = {
+            'opset_version': 14,
+            'do_constant_folding': True,
+            'input_names': input_names,
+            'output_names': ['output'],
+            'dynamic_axes': dynamic_axes,
+        }
+        if use_external_data:
+            # PyTorch 2.8+ renamed 'use_external_data_format' to 'external_data'
+            sig = inspect.signature(torch.onnx.export)
+            if 'external_data' in sig.parameters:
+                export_kwargs['external_data'] = True
+            else:
+                export_kwargs['use_external_data_format'] = True
+        return export_kwargs
diff --git a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
index c7f28a2eb..c72f598cd 100644
--- a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
+++ b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
@@ -85,6 +85,7 @@ class HuggingFaceModelLoader:
             ``False``; enabling this turns ``--model_identifier`` into an RCE
             sink, so it is opt-in only.
     """
+
     def __init__(
         self,
         cache_dir: Optional[str] = None,
@@ -150,19 +151,7 @@ def load_model(
         validate_model_identifier(model_identifier)
 
         try:
-            # Convert torch_dtype string to torch dtype
-            dtype = self._get_torch_dtype(torch_dtype) if torch_dtype else None
-
-            # Prepare loading kwargs
-            load_kwargs = {'cache_dir': self.cache_dir, 'revision': revision, **kwargs}
-
-            # Add token if available
-            if self.token:
-                load_kwargs['token'] = self.token
-
-            # Add dtype if specified
-            if dtype:
-                load_kwargs['torch_dtype'] = dtype
+            load_kwargs = self._build_load_kwargs(torch_dtype, revision, kwargs)
 
             # Load config (use pre-downloaded config if provided)
             if config is None:
@@ -173,35 +162,11 @@ def load_model(
             else:
                 logger.info('Using pre-downloaded model configuration.')
 
-            # Load tokenizer (may fail for some models, that's ok)
-            tokenizer = None
-            try:
-                logger.info('Loading tokenizer...')
-                tokenizer = AutoTokenizer.from_pretrained(
-                    model_identifier, trust_remote_code=self.allow_remote_code, **load_kwargs
-                )
-            except Exception as e:
-                logger.warning(f'Could not load tokenizer: {e}. Continuing without tokenizer.')
+            tokenizer = self._try_load_tokenizer(model_identifier, load_kwargs)
 
             # Load model
             logger.info(f'Loading model weights (dtype={torch_dtype}, device={device})...')
-            model_kwargs = load_kwargs.copy()
-            model_kwargs['trust_remote_code'] = self.allow_remote_code
-
-            # Handle device mapping for large models
-            effective_device_map = device_map
-            if device_map:
-                model_kwargs['device_map'] = device_map
-            elif device == 'cuda' and torch.cuda.is_available():
-                # Don't set device_map if device is explicitly cuda
-                pass
-            elif device != 'cpu':
-                model_kwargs['device_map'] = device
-                effective_device_map = device
-
-            # Pass pre-downloaded config to from_pretrained so any overrides take effect
-            if config is not None:
-                model_kwargs['config'] = config
+            model_kwargs, effective_device_map = self._build_model_kwargs(load_kwargs, device, device_map, config)
 
             try:
                 model = AutoModel.from_pretrained(model_identifier, **model_kwargs)
@@ -230,6 +195,52 @@ def load_model(
         except Exception as e:
             raise ModelLoadError(f"Unexpected error loading model '{model_identifier}': {e}") from e
 
+    def _build_load_kwargs(self, torch_dtype, revision, extra_kwargs):
+        """Return the base ``from_pretrained`` kwargs: cache_dir, revision, caller extras, token, dtype."""
+        resolved_dtype = None if not torch_dtype else self._get_torch_dtype(torch_dtype)
+        # Caller-supplied extras may override cache_dir/revision, but never a configured token/dtype.
+        merged = {'cache_dir': self.cache_dir, 'revision': revision, **extra_kwargs}
+        optional = {'token': self.token, 'torch_dtype': resolved_dtype}
+        # Falsy entries (no token, no dtype requested) are omitted rather than passed as None.
+        merged.update({key: value for key, value in optional.items() if value})
+        return merged
+
+    def _try_load_tokenizer(self, model_identifier, load_kwargs):
+        """Attempt to load a tokenizer; return None if the model has no associated tokenizer."""
+        try:
+            logger.info('Loading tokenizer...')
+            return AutoTokenizer.from_pretrained(
+                model_identifier, trust_remote_code=self.allow_remote_code, **load_kwargs
+            )
+        except Exception as e:
+            logger.warning(f'Could not load tokenizer: {e}. Continuing without tokenizer.')
+            return None
+
+    def _build_model_kwargs(self, load_kwargs, device, device_map, config):
+        """Derive the model ``from_pretrained`` kwargs and the device_map that ends up in effect.
+
+        A truthy ``device_map`` wins. Otherwise ``device`` doubles as the device_map unless it
+        is ``'cpu'``, or is ``'cuda'`` on a host where CUDA is available.
+
+        Returns:
+            Tuple[dict, Optional[str]]: ``(model_kwargs, effective_device_map)``.
+        """
+        model_kwargs = {**load_kwargs, 'trust_remote_code': self.allow_remote_code}
+
+        if device_map:
+            effective = model_kwargs['device_map'] = device_map
+        elif device == 'cpu' or (device == 'cuda' and torch.cuda.is_available()):
+            # device_map stays unset for CPU loads and for CUDA loads on a CUDA-capable host.
+            effective = device_map
+        else:
+            effective = model_kwargs['device_map'] = device
+
+        if config is not None:
+            # Hand over the pre-downloaded config so any overrides on it take effect.
+            model_kwargs['config'] = config
+
+        return model_kwargs, effective
+
     def load_model_from_config(
         self,
         config: ModelSourceConfig,
diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
index 644d964f8..7c97b2bc3 100644
--- a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
@@ -232,7 +232,7 @@ def _preprocess_huggingface_models(self):
             self._hf_config = hf_config
 
             precision_str = self._args.precision.value if self._args.precision != Precision.INT8 else 'float32'
-            fits, param_m, est_gb, avail_gb = HuggingFaceModelLoader.check_memory_fits(
+            fits, _, _, _ = HuggingFaceModelLoader.check_memory_fits(
                 self._args.model_identifier, hf_config, precision_str, mode='inference', token=hf_token
             )
             if not fits:
diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py
index 22061bf7c..dc7cf7d62 100644
--- a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py
+++ b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py
@@ -16,8 +16,10 @@
 
 pytest.importorskip('transformers')
 
-from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader
-from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig
+# Imports below this point depend on `transformers` being available, so they
+# must be deferred until after the `importorskip` call above.
+from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader    # noqa: E402
+from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig    # noqa: E402
 
 
 @pytest.mark.skipif(
@@ -26,6 +28,7 @@
 )
 class TestHuggingFaceE2E:
     """End-to-end tests for HuggingFace model loading."""
+
     @pytest.fixture
     def loader(self, tmp_path):
         """Create a loader instance with an isolated per-test cache dir."""

From a5d845c89e0fc26b89b2590852af5399ff011240 Mon Sep 17 00:00:00 2001
From: Aishwarya Tonpe <aishwarya.tonpe25@gmail.com>
Date: Tue, 2 Jun 2026 10:55:55 -0700
Subject: [PATCH 08/12] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 .../benchmarks/micro_benchmarks/huggingface_model_loader.py   | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
index c72f598cd..6e4a5511b 100644
--- a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
+++ b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
@@ -52,6 +52,10 @@ def validate_model_identifier(model_identifier: Optional[str]) -> str:
             "'^[A-Za-z0-9][A-Za-z0-9._-]{0,127}(/[A-Za-z0-9._-]{1,128})?$' "
             '(e.g. "bert-base-uncased" or "meta-llama/Llama-2-7b-hf").'
         )
+    if Path(model_identifier).exists():
+        raise ValueError(
+            f'Invalid model_identifier {model_identifier!r}. Refusing to treat an existing local path as a Hub repo id.'
+        )
     return model_identifier
 
 

From 09f14b27fdf32aa68725160aff6582a0fb7aba6d Mon Sep 17 00:00:00 2001
From: Aishwarya Tonpe <aishwarya.tonpe25@gmail.com>
Date: Tue, 2 Jun 2026 10:55:55 -0700
Subject: [PATCH 09/12] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 .../tensorrt_inference_performance.py         |  6 ++++++
 .../micro_benchmarks/_export_torch_to_onnx.py |  3 ---
 .../huggingface_model_loader.py               | 20 ++++++++++++++++---
 .../ort_inference_performance.py              |  2 +-
 .../tensorrt_inference_performance.py         |  7 ++++---
 .../micro_benchmarks/test_huggingface_e2e.py  |  1 -
 6 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/examples/benchmarks/tensorrt_inference_performance.py b/examples/benchmarks/tensorrt_inference_performance.py
index 4385a728e..1880ab242 100644
--- a/examples/benchmarks/tensorrt_inference_performance.py
+++ b/examples/benchmarks/tensorrt_inference_performance.py
@@ -91,6 +91,12 @@ def run_huggingface_benchmark(model_identifier, precision='fp16', batch_size=32,
     parser.add_argument('--iterations', type=int, default=2048)
     args = parser.parse_args()
 
+    if args.model_source == 'huggingface' and args.precision == 'int8':
+        parser.error(
+            '--precision int8 is not supported with --model_source huggingface '
+            '(no calibration data / Q-DQ ONNX is generated). Use fp16 or fp32.'
+        )
+
     if args.model_source == 'huggingface':
         run_huggingface_benchmark(
             args.model_identifier, args.precision, args.batch_size, args.seq_length, args.iterations
diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
index d395d18e0..28715db64 100644
--- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
+++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
@@ -18,7 +18,6 @@
 
 class torch2onnxExporter():
     """PyTorch model to ONNX exporter."""
-
     def __init__(self):
         """Constructor."""
         from transformers import BertConfig, GPT2Config, LlamaConfig
@@ -360,7 +359,6 @@ def _build_vision_export_inputs(self, model, batch_size, model_dtype, device):
         dynamic_axes = {'pixel_values': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
 
         class VisionModelWrapper(torch.nn.Module):
-
             def __init__(self, model):
                 super().__init__()
                 self.model = model
@@ -398,7 +396,6 @@ def _build_nlp_export_inputs(self, model, batch_size, seq_length, device):
         }
 
         class NLPModelWrapper(torch.nn.Module):
-
             def __init__(self, model):
                 super().__init__()
                 self.model = model
diff --git a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
index 6e4a5511b..0937e40aa 100644
--- a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
+++ b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
@@ -89,7 +89,6 @@ class HuggingFaceModelLoader:
             ``False``; enabling this turns ``--model_identifier`` into an RCE
             sink, so it is opt-in only.
     """
-
     def __init__(
         self,
         cache_dir: Optional[str] = None,
@@ -154,6 +153,11 @@ def load_model(
         # Reject malformed / path-like identifiers before any network or disk activity.
         validate_model_identifier(model_identifier)
 
+        # Fall back to CPU on hosts without CUDA so default device='cuda' callers don't fail.
+        if device == 'cuda' and not torch.cuda.is_available():
+            logger.warning('CUDA not available; falling back to CPU.')
+            device = 'cpu'
+
         try:
             load_kwargs = self._build_load_kwargs(torch_dtype, revision, kwargs)
 
@@ -211,10 +215,12 @@ def _build_load_kwargs(self, torch_dtype, revision, extra_kwargs):
 
     def _try_load_tokenizer(self, model_identifier, load_kwargs):
         """Attempt to load a tokenizer; return None if the model has no associated tokenizer."""
+        # Tokenizers don't accept model-only kwargs like torch_dtype/device_map; strip before passing.
+        tokenizer_kwargs = {k: v for k, v in load_kwargs.items() if k not in ('torch_dtype', 'device_map')}
         try:
             logger.info('Loading tokenizer...')
             return AutoTokenizer.from_pretrained(
-                model_identifier, trust_remote_code=self.allow_remote_code, **load_kwargs
+                model_identifier, trust_remote_code=self.allow_remote_code, **tokenizer_kwargs
             )
         except Exception as e:
             logger.warning(f'Could not load tokenizer: {e}. Continuing without tokenizer.')
@@ -373,7 +379,15 @@ def estimate_param_count_from_config(hf_config) -> Optional[int]:
 
             # Embeddings: token + (optional) position
             max_pos = getattr(hf_config, 'max_position_embeddings', 0)
-            has_pos_embed = getattr(hf_config, 'position_embedding_type', None) not in ('rotary', None)
+            pos_embed_type = getattr(hf_config, 'position_embedding_type', None)
+            # When position_embedding_type is missing/None, default to assuming learned
+            # position embeddings exist (common for BERT-style configs that omit the field).
+            # Only skip the term when the type is explicitly rotary, or the config clearly
+            # indicates RoPE/rotary via rope_theta/rotary_pct/rotary_emb_base.
+            uses_rotary = pos_embed_type == 'rotary' or any(
+                getattr(hf_config, attr, None) is not None for attr in ('rope_theta', 'rotary_pct', 'rotary_emb_base')
+            )
+            has_pos_embed = not uses_rotary
             embed_params = vocab * hidden
             if has_pos_embed and max_pos > 0:
                 embed_params += max_pos * hidden
diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
index 7c97b2bc3..ee76def7d 100644
--- a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
@@ -145,7 +145,7 @@ def add_parser_arguments(self):
             default=False,
             required=False,
             help='Allow HuggingFace to execute model-repo Python (trust_remote_code=True). '
-            'SECURITY: enables RCE from --model_identifier. Pin --revision <sha> when used.',
+            'SECURITY: enables RCE from --model_identifier; only enable for trusted model identifiers.',
         )
 
     def _preprocess(self):
diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
index 06d1dd6e0..d467c7987 100644
--- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
@@ -104,7 +104,7 @@ def add_parser_arguments(self):
             default=False,
             required=False,
             help='Allow HuggingFace to execute model-repo Python (trust_remote_code=True). '
-            'SECURITY: enables RCE from --model_identifier. Pin --revision <sha> when used.',
+            'SECURITY: enables RCE from --model_identifier; only enable for trusted model identifiers.',
         )
 
     @staticmethod
@@ -238,9 +238,10 @@ def _preprocess_huggingface_models(self):
             hf_config = AutoConfig.from_pretrained(
                 self._args.model_identifier, trust_remote_code=allow_remote_code, **load_kwargs
             )
-            precision_str = self._args.precision    # already a string: 'fp16', 'fp32', 'int8'
+            # ONNX export is always done in float32 (see _build_trtexec_command_for_hf), so gate
+            # the pre-download check on fp32 memory regardless of the requested runtime precision.
             fits, _, _, _ = HuggingFaceModelLoader.check_memory_fits(
-                self._args.model_identifier, hf_config, precision_str, mode='inference', token=hf_token
+                self._args.model_identifier, hf_config, 'fp32', mode='inference', token=hf_token
             )
             if not fits:
                 self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py
index dc7cf7d62..3f2283bb5 100644
--- a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py
+++ b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py
@@ -28,7 +28,6 @@
 )
 class TestHuggingFaceE2E:
     """End-to-end tests for HuggingFace model loading."""
-
     @pytest.fixture
     def loader(self, tmp_path):
         """Create a loader instance with an isolated per-test cache dir."""

From 135f38159331f07d2f07ed8734b3c5d9a74fc1c7 Mon Sep 17 00:00:00 2001
From: Aishwarya Tonpe <aishwarya.tonpe25@gmail.com>
Date: Tue, 2 Jun 2026 10:55:55 -0700
Subject: [PATCH 10/12] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 .../huggingface_model_loader.py               |  2 +-
 .../ort_inference_performance.py              | 17 ++++++++++++----
 .../tensorrt_inference_performance.py         |  7 +++++++
 .../micro_benchmarks/test_huggingface_e2e.py  | 20 +++++++++++--------
 4 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
index 0937e40aa..ba8454df0 100644
--- a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
+++ b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py
@@ -350,7 +350,7 @@ def _get_model_size(self, model: PreTrainedModel) -> float:
         Returns:
             Number of parameters in millions.
         """
-        return float(sum(p.numel() for p in model.parameters())) / 1_000_000
+        return float(sum(p.numel() for p in model.parameters())) / 1_000_000    # type: ignore[attr-defined]
 
     @staticmethod
     def estimate_param_count_from_config(hf_config) -> Optional[int]:
diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
index ee76def7d..309c37b06 100644
--- a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
@@ -23,6 +23,7 @@
 
 class ORTInferenceBenchmark(MicroBenchmark):
     """ONNXRuntime inference micro-benchmark class."""
+
     def __init__(self, name, parameters=''):
         """Constructor.
 
@@ -240,7 +241,8 @@ def _preprocess_huggingface_models(self):
                 return False
 
             # Step 2: Export the model to ONNX (and quantize for INT8) on a per-rank path.
-            return self._export_hf_model_to_onnx(hf_token, allow_remote_code)
+            # Reuse the already-downloaded hf_config to avoid a redundant fetch in load_model_from_config.
+            return self._export_hf_model_to_onnx(hf_token, allow_remote_code, hf_config)
 
         except Exception as e:
             logger.error(f'Failed to prepare HuggingFace model: {str(e)}')
@@ -249,12 +251,13 @@ def _preprocess_huggingface_models(self):
             self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
             return False
 
-    def _export_hf_model_to_onnx(self, hf_token, allow_remote_code):
+    def _export_hf_model_to_onnx(self, hf_token, allow_remote_code, hf_config=None):
         """Download the HF model, export to ONNX, and apply INT8 quantization if requested.
 
         Args:
             hf_token (str | None): HuggingFace token, or None.
             allow_remote_code (bool): Whether to allow trust_remote_code on load.
+            hf_config: Pre-downloaded HF config to reuse; avoids a redundant fetch.
 
         Returns:
             bool: True on success; False (with return code set) on failure.
@@ -277,9 +280,9 @@ def _export_hf_model_to_onnx(self, hf_token, allow_remote_code):
             device_map=None,
         )
 
-        # Load model from HuggingFace on CPU
+        # Load model from HuggingFace on CPU, reusing the preloaded config when available.
         loader = HuggingFaceModelLoader(allow_remote_code=allow_remote_code)
-        hf_model, _, _ = loader.load_model_from_config(model_config, device='cpu')
+        hf_model, _, _ = loader.load_model_from_config(model_config, device='cpu', config_pretrained=hf_config)
         from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter
         exporter = torch2onnxExporter()
 
@@ -320,6 +323,12 @@ def _export_hf_model_to_onnx(self, hf_token, allow_remote_code):
             self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
             return False
 
+        # Release the torch model now that ONNX export is done; export_huggingface_model() may
+        # have moved it onto GPU, and we don't want it occupying VRAM during ORT session creation.
+        del hf_model
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
         # Apply INT8 quantization if requested (matching in-house model behavior)
         if self._args.precision == Precision.INT8:
             from onnxruntime.quantization import quantize_dynamic
diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
index d467c7987..386e84dc0 100644
--- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
@@ -23,6 +23,7 @@
 
 class TensorRTInferenceBenchmark(MicroBenchmarkWithInvoke):
     """TensorRT inference micro-benchmark class."""
+
     def __init__(self, name, parameters=''):
         """Constructor.
 
@@ -321,6 +322,12 @@ def _build_trtexec_command_for_hf(self, hf_token, allow_remote_code):
             self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
             return False
 
+        # Release the torch model now that ONNX export is done; export_huggingface_model() may
+        # have moved it onto GPU, and we don't want it holding VRAM while trtexec builds the engine.
+        del hf_model
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
         input_shapes = self._derive_trt_input_shapes(onnx_path)
 
         # Build TensorRT command with correct input name
diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py
index 3f2283bb5..d9922d72e 100644
--- a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py
+++ b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py
@@ -18,8 +18,12 @@
 
 # Imports below this point depend on `transformers` being available, so they
 # must be deferred until after the `importorskip` call above.
-from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader    # noqa: E402
-from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig    # noqa: E402
+from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import (    # noqa: E402
+    HuggingFaceModelLoader,
+)
+from superbench.benchmarks.micro_benchmarks.model_source_config import (    # noqa: E402
+    ModelSourceConfig,
+)
 
 
 @pytest.mark.skipif(
@@ -38,7 +42,7 @@ def test_load_tiny_bert_model(self, loader):
 
         Uses prajjwal1/bert-tiny which is a small public BERT model (~17MB).
         """
-        model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu')
+        model, config, _ = loader.load_model('prajjwal1/bert-tiny', device='cpu')
 
         assert model is not None
         assert config is not None
@@ -55,7 +59,7 @@ def test_load_distilgpt2_model(self, loader):
 
         Uses distilbert/distilgpt2 which is a small public GPT-2 model (~82MB).
         """
-        model, config, tokenizer = loader.load_model('distilbert/distilgpt2', device='cpu')
+        model, config, _ = loader.load_model('distilbert/distilgpt2', device='cpu')
 
         assert model is not None
         assert config is not None
@@ -71,14 +75,14 @@ def test_load_model_from_config(self, loader):
         """Test loading model using ModelSourceConfig via load_model_from_config."""
         config = ModelSourceConfig(source='huggingface', identifier='prajjwal1/bert-tiny', torch_dtype='float32')
 
-        model, hf_config, tokenizer = loader.load_model_from_config(config, device='cpu')
+        model, hf_config, _ = loader.load_model_from_config(config, device='cpu')
 
         assert model is not None
         assert hf_config.model_type == 'bert'
 
     def test_load_model_with_dtype(self, loader):
         """Test loading model and converting dtype after load."""
-        model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu')
+        model, _, _ = loader.load_model('prajjwal1/bert-tiny', device='cpu')
 
         # Convert to float32 after loading
         model = model.float()
@@ -90,7 +94,7 @@ def test_load_model_with_dtype(self, loader):
     @pytest.mark.skipif(not torch.cuda.is_available(), reason='Requires GPU')
     def test_load_model_to_gpu(self, loader):
         """Test loading model and moving to GPU."""
-        model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu')
+        model, _, _ = loader.load_model('prajjwal1/bert-tiny', device='cpu')
 
         # Move to GPU manually
         model = model.cuda()
@@ -101,7 +105,7 @@ def test_load_model_to_gpu(self, loader):
 
     def test_architecture_detection(self, loader):
         """Test that architecture is correctly detected from loaded model."""
-        model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu')
+        _, config, _ = loader.load_model('prajjwal1/bert-tiny', device='cpu')
 
         # Architecture should be detected from config
         assert config.model_type is not None

From c3744390ce6138c40aa4f8e862232ecab6120c41 Mon Sep 17 00:00:00 2001
From: Aishwarya Tonpe <aishwarya.tonpe25@gmail.com>
Date: Tue, 2 Jun 2026 10:55:55 -0700
Subject: [PATCH 11/12] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 .../test_tensorrt_inference_performance.py    | 451 ++++++++++++++++++
 1 file changed, 451 insertions(+)

diff --git a/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py b/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py
index 301a4a08d..441be7af1 100644
--- a/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py
@@ -6,6 +6,7 @@
 import unittest
 from pathlib import Path
 from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
 
 from tests.helper import decorator
 from tests.helper.testcase import BenchmarkTestCase
@@ -13,6 +14,31 @@
 from superbench.benchmarks.result import BenchmarkResult
 
 
+def _make_onnx_dim(value):
+    """Return a stand-in for one ONNX shape dimension, exposing only ``dim_value``."""
+    return SimpleNamespace(dim_value=value)
+
+
+def _make_onnx_input(name, dims):
+    """Return a stand-in ONNX graph input named ``name`` with one dim object per entry of ``dims``.
+
+    Pass ``0`` for a dynamic/unknown dimension: ``onnx`` reports symbolic dims with
+    ``dim_param`` set and ``dim_value`` == 0.
+    """
+    # Assemble the ``type.tensor_type.shape.dim`` attribute chain from the innermost level outwards.
+    shape = SimpleNamespace(dim=[_make_onnx_dim(size) for size in dims])
+    tensor_type = SimpleNamespace(shape=shape)
+    value_type = SimpleNamespace(tensor_type=tensor_type)
+
+    return SimpleNamespace(name=name, type=value_type)
+
+
+def _make_onnx_model(inputs, initializer_names=()):
+    """Return a stand-in ONNX model exposing ``graph.input`` and name-only ``graph.initializer``."""
+    named_stubs = [SimpleNamespace(name=ident) for ident in initializer_names]
+    return SimpleNamespace(graph=SimpleNamespace(input=[*inputs], initializer=named_stubs))
+
+
 class TensorRTInferenceBenchmarkTestCase(BenchmarkTestCase, unittest.TestCase):
     """Class for tensorrt-inferencee benchmark test cases."""
     @classmethod
@@ -144,3 +170,428 @@ def test_tensorrt_inference_result_parsing(self, test_raw_log_1, test_raw_log_2)
 
         # Negative case - invalid raw output
         self.assertFalse(benchmark._process_raw_result(1, 'Invalid raw output'))
+
+
+_TENSORRT_MODULE = 'superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance'
+
+
+class TensorRTInferenceHuggingFaceTestCase(unittest.TestCase):
+    """Unit tests for the HuggingFace-specific helpers on TensorRTInferenceBenchmark.
+
+    These tests exercise the methods that previously had zero coverage:
+    ``_preprocess_huggingface_models``, ``_build_trtexec_command_for_hf``,
+    ``_derive_trt_input_shapes``, ``_derive_vision_input_shape``, and
+    ``_derive_nlp_input_shapes``. They are pure unit tests (no CUDA / no HF
+    network) and rely on mocking the model loader, ONNX exporter, and the
+    ``onnx`` loader to keep them fast and deterministic.
+    """
+
+    benchmark_name = 'tensorrt-inference'
+
+    def _make_benchmark(self, **arg_overrides):
+        """Create a benchmark object primed for the HuggingFace helper tests.
+
+        The object imitates the state left behind by ``_preprocess``: a result
+        holder, HuggingFace-flavoured arguments (any of which can be replaced
+        through ``arg_overrides``), a fake trtexec path, a workspace flag and an
+        empty command list. Neither trtexec nor the filesystem is touched.
+        """
+        name = self.benchmark_name
+        benchmark_cls, _unused = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, Platform.CUDA)
+        benchmark = benchmark_cls(name, parameters='')
+        benchmark._result = BenchmarkResult(name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
+        # Baseline arguments; keyword overrides take precedence over them.
+        arg_values = {
+            'model_source': 'huggingface',
+            'model_identifier': 'prajjwal1/bert-tiny',
+            'allow_remote_code': False,
+            'precision': 'fp16',
+            'batch_size': 8,
+            'seq_length': 128,
+            'iterations': 128,
+            'pytorch_models': [],
+            'log_raw_data': False,
+            **arg_overrides,
+        }
+        benchmark._args = SimpleNamespace(**arg_values)
+        # Private attributes are written under their name-mangled spelling.
+        mangled_prefix = '_TensorRTInferenceBenchmark'
+        setattr(benchmark, f'{mangled_prefix}__bin_path', '/fake/bin/trtexec')
+        setattr(benchmark, f'{mangled_prefix}__workspace_flag', '--memPoolSize=workspace:8192M')
+        benchmark._commands = []
+        return benchmark
+
+    # ------------------------------------------------------------------
+    # _preprocess_huggingface_models
+    # ------------------------------------------------------------------
+
+    def test_preprocess_hf_missing_model_identifier(self):
+        """Missing --model_identifier is rejected before any HF I/O."""
+        benchmark = self._make_benchmark(model_identifier=None)
+
+        self.assertFalse(benchmark._preprocess_huggingface_models())
+        self.assertEqual(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE, benchmark.return_code)
+        self.assertEqual([], benchmark._commands)
+
+    def test_preprocess_hf_invalid_identifier(self):
+        """Path-like / unsafe identifier is rejected by validate_model_identifier."""
+        benchmark = self._make_benchmark(model_identifier='../etc/passwd')
+
+        self.assertFalse(benchmark._preprocess_huggingface_models())
+        self.assertEqual(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE, benchmark.return_code)
+
+    def test_preprocess_hf_int8_rejected(self):
+        """INT8 on the HF path is rejected (no calibration data emitted)."""
+        benchmark = self._make_benchmark(precision='int8')
+
+        self.assertFalse(benchmark._preprocess_huggingface_models())
+        self.assertEqual(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE, benchmark.return_code)
+
+    def test_preprocess_hf_memory_check_fails(self):
+        """When check_memory_fits reports fits=False, preprocess fails."""
+        benchmark = self._make_benchmark()
+
+        fake_config = MagicMock(name='AutoConfigInstance')
+        with patch('transformers.AutoConfig') as mock_auto_config, \
+                patch(f'{_TENSORRT_MODULE}.HuggingFaceModelLoader') as mock_loader_cls:
+            mock_auto_config.from_pretrained.return_value = fake_config
+            mock_loader_cls.check_memory_fits.return_value = (False, 1000.0, 30.0, 16.0)
+
+            self.assertFalse(benchmark._preprocess_huggingface_models())
+
+        self.assertEqual(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE, benchmark.return_code)
+        mock_auto_config.from_pretrained.assert_called_once()
+
+    def test_preprocess_hf_auto_config_exception(self):
+        """An exception while downloading the config is caught and converted to failure."""
+        benchmark = self._make_benchmark()
+
+        with patch('transformers.AutoConfig') as mock_auto_config:
+            mock_auto_config.from_pretrained.side_effect = RuntimeError('boom')
+
+            self.assertFalse(benchmark._preprocess_huggingface_models())
+
+        self.assertEqual(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE, benchmark.return_code)
+
+    def test_preprocess_hf_happy_path_delegates_to_build_command(self):
+        """Happy path: config + memory check pass and the build helper is invoked."""
+        benchmark = self._make_benchmark()
+
+        fake_config = MagicMock(name='AutoConfigInstance')
+        with patch('transformers.AutoConfig') as mock_auto_config, \
+                patch(f'{_TENSORRT_MODULE}.HuggingFaceModelLoader') as mock_loader_cls, \
+                patch.object(
+                    benchmark, '_build_trtexec_command_for_hf', return_value=True
+                ) as mock_build:
+            mock_auto_config.from_pretrained.return_value = fake_config
+            mock_loader_cls.check_memory_fits.return_value = (True, 4.0, 0.02, 16.0)
+
+            self.assertTrue(benchmark._preprocess_huggingface_models())
+
+        # AutoConfig must be called with trust_remote_code matching --allow_remote_code (False here).
+        config_kwargs = mock_auto_config.from_pretrained.call_args.kwargs
+        self.assertFalse(config_kwargs['trust_remote_code'])
+        # Memory check must run for fp32 (ONNX export dtype) regardless of --precision.
+        mem_args, mem_kwargs = mock_loader_cls.check_memory_fits.call_args
+        self.assertEqual('fp32', mem_args[2])
+        self.assertEqual('inference', mem_kwargs.get('mode'))
+        mock_build.assert_called_once()
+
+    def test_preprocess_hf_allow_remote_code_propagates(self):
+        """--allow_remote_code is forwarded as trust_remote_code=True."""
+        benchmark = self._make_benchmark(allow_remote_code=True)
+
+        with patch('transformers.AutoConfig') as mock_auto_config, \
+                patch(f'{_TENSORRT_MODULE}.HuggingFaceModelLoader') as mock_loader_cls, \
+                patch.object(benchmark, '_build_trtexec_command_for_hf', return_value=True):
+            mock_auto_config.from_pretrained.return_value = MagicMock()
+            mock_loader_cls.check_memory_fits.return_value = (True, 1.0, 0.01, 16.0)
+            # Assert the outcome too, so an early bail-out is reported as such.
+            self.assertTrue(benchmark._preprocess_huggingface_models())
+
+        self.assertTrue(mock_auto_config.from_pretrained.call_args.kwargs['trust_remote_code'])
+
+    # ------------------------------------------------------------------
+    # _build_trtexec_command_for_hf
+    # ------------------------------------------------------------------
+
+    def _patch_build_dependencies(self, onnx_path='/tmp/fake.onnx', input_shapes='input_ids:8x128'):
+        """Return un-started patchers for the module names the build helper uses.
+
+        Order: loader, ModelSourceConfig, exporter, ``os.makedirs``, ``torch``.
+        ``onnx_path`` and ``input_shapes`` are accepted but not used here.
+        """
+        targets = ('HuggingFaceModelLoader', 'ModelSourceConfig', 'torch2onnxExporter', 'os.makedirs', 'torch')
+        return tuple(patch(f'{_TENSORRT_MODULE}.{target}') for target in targets)
+
+    def test_build_trtexec_command_for_hf_success(self):
+        """Happy path: command is appended and shape/precision flags are correct."""
+        benchmark = self._make_benchmark(precision='fp16')
+
+        loader_p, msc_p, exporter_p, makedirs_p, torch_p = self._patch_build_dependencies()
+        derived_shapes = 'input_ids:8x128,attention_mask:8x128'
+        with loader_p as mock_loader_cls, msc_p as mock_msc, exporter_p as mock_exporter_cls, \
+                makedirs_p as mock_makedirs, torch_p as mock_torch, \
+                patch.object(benchmark, '_derive_trt_input_shapes', return_value=derived_shapes) as mock_derive:
+            mock_torch.hub.get_dir.return_value = '/tmp/torchhub'
+            mock_torch.cuda.is_available.return_value = False
+
+            mock_loader = MagicMock()
+            mock_loader_cls.return_value = mock_loader
+            mock_hf_model = MagicMock(name='HFModel')
+            mock_hf_config = MagicMock(name='HFConfig')
+            mock_loader.load_model_from_config.return_value = (mock_hf_model, mock_hf_config, None)
+
+            mock_exporter = MagicMock()
+            mock_exporter_cls.return_value = mock_exporter
+            mock_exporter.export_huggingface_model.return_value = '/tmp/torchhub/checkpoints/trt_rank_0/m.onnx'
+
+            ok = benchmark._build_trtexec_command_for_hf(hf_token=None, allow_remote_code=False)
+
+        self.assertTrue(ok)
+        self.assertIs(benchmark._hf_config, mock_hf_config)
+        # makedirs called once with the rank-scoped output dir.
+        mock_makedirs.assert_called_once()
+        self.assertTrue(mock_makedirs.call_args.args[0].endswith('trt_rank_0'))
+        # ModelSourceConfig is constructed with float32 + device_map=None (CPU load).
+        msc_kwargs = mock_msc.call_args.kwargs
+        self.assertEqual('float32', msc_kwargs['torch_dtype'])
+        self.assertIsNone(msc_kwargs['device_map'])
+        self.assertEqual('huggingface', msc_kwargs['source'])
+        # Exporter called with the configured batch_size / seq_length.
+        export_kwargs = mock_exporter.export_huggingface_model.call_args.kwargs
+        self.assertEqual(8, export_kwargs['batch_size'])
+        self.assertEqual(128, export_kwargs['seq_length'])
+        # _derive_trt_input_shapes was invoked with the exported ONNX path.
+        mock_derive.assert_called_once_with('/tmp/torchhub/checkpoints/trt_rank_0/m.onnx')
+        # Exactly one command appended, containing the expected flags.
+        self.assertEqual(1, len(benchmark._commands))
+        cmd = benchmark._commands[0]
+        self.assertIn('/fake/bin/trtexec', cmd)
+        self.assertIn('--onnx=/tmp/torchhub/checkpoints/trt_rank_0/m.onnx', cmd)
+        self.assertIn(f'--optShapes={derived_shapes}', cmd)
+        self.assertIn('--memPoolSize=workspace:8192M', cmd)
+        self.assertIn('--fp16', cmd)
+        self.assertIn('--iterations=128', cmd)
+        self.assertIn('--percentile=99', cmd)
+        # pytorch_models is rewritten so _process_raw_result can key off the HF id.
+        self.assertEqual(['prajjwal1_bert-tiny'], benchmark._args.pytorch_models)
+
+    def test_build_trtexec_command_for_hf_fp32_omits_precision_flag(self):
+        """fp32 precision must not emit a ``--fp32`` or ``--int8`` flag."""
+        benchmark = self._make_benchmark(precision='fp32')
+
+        loader_p, msc_p, exporter_p, makedirs_p, torch_p = self._patch_build_dependencies()
+        with loader_p as mock_loader_cls, msc_p, exporter_p as mock_exporter_cls, \
+                makedirs_p, torch_p as mock_torch, \
+                patch.object(benchmark, '_derive_trt_input_shapes', return_value='input_ids:8x128'):
+            mock_torch.hub.get_dir.return_value = '/tmp/torchhub'
+            mock_torch.cuda.is_available.return_value = False
+            mock_loader_cls.return_value.load_model_from_config.return_value = (MagicMock(), MagicMock(), None)
+            mock_exporter_cls.return_value.export_huggingface_model.return_value = (
+                '/tmp/torchhub/checkpoints/trt_rank_0/m.onnx'
+            )
+
+            self.assertTrue(benchmark._build_trtexec_command_for_hf(None, False))
+
+        cmd = benchmark._commands[0]
+        self.assertNotIn('--fp16', cmd)
+        self.assertNotIn('--fp32', cmd)
+        self.assertNotIn('--int8', cmd)
+
+    def test_build_trtexec_command_for_hf_export_failure(self):
+        """If ONNX export returns falsy, the build fails and no command is queued."""
+        benchmark = self._make_benchmark()
+
+        loader_p, msc_p, exporter_p, makedirs_p, torch_p = self._patch_build_dependencies()
+        with loader_p as mock_loader_cls, msc_p, exporter_p as mock_exporter_cls, \
+                makedirs_p, torch_p as mock_torch:
+            mock_torch.hub.get_dir.return_value = '/tmp/torchhub'
+            mock_torch.cuda.is_available.return_value = False
+            mock_loader_cls.return_value.load_model_from_config.return_value = (MagicMock(), MagicMock(), None)
+            mock_exporter_cls.return_value.export_huggingface_model.return_value = None
+
+            self.assertFalse(benchmark._build_trtexec_command_for_hf(None, False))
+
+        self.assertEqual([], benchmark._commands)
+        self.assertEqual(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE, benchmark.return_code)
+
+    def test_build_trtexec_command_for_hf_uses_proc_rank_env(self):
+        """With PROC_RANK=3 and CUDA_VISIBLE_DEVICES=0 set, the output dir ends in ``trt_rank_3``."""
+        benchmark = self._make_benchmark()
+
+        loader_p, msc_p, exporter_p, makedirs_p, torch_p = self._patch_build_dependencies()
+        with loader_p as mock_loader_cls, msc_p, exporter_p as mock_exporter_cls, \
+                makedirs_p as mock_makedirs, torch_p as mock_torch, \
+                patch.object(benchmark, '_derive_trt_input_shapes', return_value='input_ids:8x128'), \
+                patch.dict('os.environ', {'CUDA_VISIBLE_DEVICES': '0', 'PROC_RANK': '3'}, clear=False):
+            mock_torch.hub.get_dir.return_value = '/tmp/torchhub'
+            mock_torch.cuda.is_available.return_value = False
+            mock_loader_cls.return_value.load_model_from_config.return_value = (MagicMock(), MagicMock(), None)
+            mock_exporter_cls.return_value.export_huggingface_model.return_value = (
+                '/tmp/torchhub/checkpoints/trt_rank_3/m.onnx'
+            )
+
+            self.assertTrue(benchmark._build_trtexec_command_for_hf(None, False))
+
+        self.assertTrue(mock_makedirs.call_args.args[0].endswith('trt_rank_3'))
+
+    # ------------------------------------------------------------------
+    # _derive_trt_input_shapes
+    # ------------------------------------------------------------------
+
+    def test_derive_trt_input_shapes_vision_by_pixel_values_name(self):
+        """A 4D input named ``pixel_values`` is routed to the vision helper.
+
+        The input is deliberately 4D: the expected ``BxCxHxW`` string needs
+        three non-batch dims, so this case covers the conventional HF vision
+        input name together with the rank-4 layout.
+        """
+        benchmark = self._make_benchmark(batch_size=4)
+        model = _make_onnx_model([_make_onnx_input('pixel_values', [0, 3, 224, 224])])
+
+        with patch('onnx.load', return_value=model):
+            shapes = benchmark._derive_trt_input_shapes('/tmp/fake.onnx')
+
+        self.assertEqual('pixel_values:4x3x224x224', shapes)
+
+    def test_derive_trt_input_shapes_vision_by_4d_shape(self):
+        """A 4D non-``pixel_values`` input is still treated as vision."""
+        benchmark = self._make_benchmark(batch_size=2)
+        vision_input = _make_onnx_input('image', [0, 3, 256, 256])
+        model = _make_onnx_model([vision_input])
+
+        with patch('onnx.load', return_value=model):
+            shapes = benchmark._derive_trt_input_shapes('/tmp/fake.onnx')
+
+        self.assertEqual('image:2x3x256x256', shapes)
+
+    def test_derive_trt_input_shapes_nlp_multi_input(self):
+        """NLP routing: 2D inputs are emitted as ``name:BxS`` and comma-joined."""
+        benchmark = self._make_benchmark(batch_size=4, seq_length=64)
+        inputs = [
+            _make_onnx_input('input_ids', [0, 0]),
+            _make_onnx_input('attention_mask', [0, 0]),
+        ]
+        model = _make_onnx_model(inputs)
+
+        with patch('onnx.load', return_value=model):
+            shapes = benchmark._derive_trt_input_shapes('/tmp/fake.onnx')
+
+        self.assertEqual('input_ids:4x64,attention_mask:4x64', shapes)
+
+    def test_derive_trt_input_shapes_filters_initializers(self):
+        """Initializer-named graph inputs are excluded from runtime inputs."""
+        benchmark = self._make_benchmark(batch_size=1, seq_length=16)
+        runtime = _make_onnx_input('input_ids', [0, 0])
+        weight = _make_onnx_input('weight', [768, 768])
+        model = _make_onnx_model([weight, runtime], initializer_names=['weight'])
+
+        with patch('onnx.load', return_value=model):
+            shapes = benchmark._derive_trt_input_shapes('/tmp/fake.onnx')
+
+        self.assertEqual('input_ids:1x16', shapes)
+
+    def test_derive_trt_input_shapes_no_runtime_inputs_raises(self):
+        """A graph with only initializer-shadowed inputs raises ValueError."""
+        benchmark = self._make_benchmark()
+        weight = _make_onnx_input('weight', [768, 768])
+        model = _make_onnx_model([weight], initializer_names=['weight'])
+
+        with patch('onnx.load', return_value=model):
+            with self.assertRaises(ValueError):
+                benchmark._derive_trt_input_shapes('/tmp/fake.onnx')
+
+    # ------------------------------------------------------------------
+    # _derive_vision_input_shape
+    # ------------------------------------------------------------------
+
+    def test_derive_vision_input_shape_static_dims(self):
+        """Static ONNX dims are used verbatim (apart from the batch dim)."""
+        benchmark = self._make_benchmark(batch_size=16)
+        vision_input = _make_onnx_input('pixel_values', [0, 3, 384, 384])
+
+        result = benchmark._derive_vision_input_shape(vision_input, 'pixel_values')
+
+        self.assertEqual('pixel_values:16x3x384x384', result)
+
+    def test_derive_vision_input_shape_dynamic_with_hf_config_scalar(self):
+        """Dynamic dims fall back to ``_hf_config`` (scalar ``image_size``)."""
+        benchmark = self._make_benchmark(batch_size=4)
+        benchmark._hf_config = SimpleNamespace(num_channels=1, image_size=160)
+        vision_input = _make_onnx_input('pixel_values', [0, 0, 0, 0])
+
+        result = benchmark._derive_vision_input_shape(vision_input, 'pixel_values')
+
+        self.assertEqual('pixel_values:4x1x160x160', result)
+
+    def test_derive_vision_input_shape_dynamic_with_hf_config_tuple(self):
+        """Dynamic dims fall back to ``_hf_config`` (tuple/list ``image_size``)."""
+        benchmark = self._make_benchmark(batch_size=2)
+        benchmark._hf_config = SimpleNamespace(num_channels=3, image_size=(192, 384))
+        vision_input = _make_onnx_input('pixel_values', [0, 0, 0, 0])
+
+        result = benchmark._derive_vision_input_shape(vision_input, 'pixel_values')
+
+        self.assertEqual('pixel_values:2x3x192x384', result)
+
+    def test_derive_vision_input_shape_dynamic_without_hf_config_uses_defaults(self):
+        """No ``_hf_config`` + dynamic dims → default (3, 224, 224)."""
+        benchmark = self._make_benchmark(batch_size=1)
+        # Ensure no _hf_config is set.
+        if hasattr(benchmark, '_hf_config'):
+            del benchmark._hf_config
+        vision_input = _make_onnx_input('pixel_values', [0, 0, 0, 0])
+
+        result = benchmark._derive_vision_input_shape(vision_input, 'pixel_values')
+
+        self.assertEqual('pixel_values:1x3x224x224', result)
+
+    # ------------------------------------------------------------------
+    # _derive_nlp_input_shapes
+    # ------------------------------------------------------------------
+
+    def test_derive_nlp_input_shapes_single_2d(self):
+        """A single 2D input emits a single ``name:BxS`` entry."""
+        benchmark = self._make_benchmark(batch_size=8, seq_length=256)
+        inputs = [_make_onnx_input('input_ids', [0, 0])]
+
+        result = benchmark._derive_nlp_input_shapes(inputs)
+
+        self.assertEqual('input_ids:8x256', result)
+
+    def test_derive_nlp_input_shapes_multiple_inputs(self):
+        """Multiple inputs are joined with commas in declaration order."""
+        benchmark = self._make_benchmark(batch_size=4, seq_length=64)
+        inputs = [
+            _make_onnx_input('input_ids', [0, 0]),
+            _make_onnx_input('attention_mask', [0, 0]),
+            _make_onnx_input('token_type_ids', [0, 0]),
+        ]
+
+        result = benchmark._derive_nlp_input_shapes(inputs)
+
+        self.assertEqual(
+            'input_ids:4x64,attention_mask:4x64,token_type_ids:4x64',
+            result,
+        )
+
+    def test_derive_nlp_input_shapes_4d_input_uses_bx1xsxs(self):
+        """A 4D input (rare for NLP) gets the ``Bx1xSxS`` shape."""
+        benchmark = self._make_benchmark(batch_size=2, seq_length=32)
+        inputs = [_make_onnx_input('attention_bias', [0, 0, 0, 0])]
+
+        result = benchmark._derive_nlp_input_shapes(inputs)
+
+        self.assertEqual('attention_bias:2x1x32x32', result)
+
+    def test_derive_nlp_input_shapes_default_seq_length_when_missing(self):
+        """When ``_args.seq_length`` is absent, the helper defaults to 512."""
+        benchmark = self._make_benchmark()
+        # Strip seq_length to trigger the getattr-default branch.
+        del benchmark._args.seq_length
+        inputs = [_make_onnx_input('input_ids', [0, 0])]
+
+        result = benchmark._derive_nlp_input_shapes(inputs)
+
+        self.assertEqual('input_ids:8x512', result)

From 466f5ad6494153003a19c798d3c02cbe92d0c905 Mon Sep 17 00:00:00 2001
From: Aishwarya-Tonpe <aishwarya.tonpe25@gmail.com>
Date: Thu, 4 Jun 2026 18:13:41 +0000
Subject: [PATCH 12/12] Increase test coverage with new tests - coverage fell
 short after code added to resolve AI-suggested review comments

---
 .../ort_inference_performance.py              |   1 -
 .../tensorrt_inference_performance.py         |   1 -
 .../test_export_torch_to_onnx.py              | 450 ++++++++++++++++++
 .../test_ort_inference_performance.py         | 306 ++++++++++++
 .../test_tensorrt_inference_performance.py    |  13 +-
 5 files changed, 760 insertions(+), 11 deletions(-)
 create mode 100644 tests/benchmarks/micro_benchmarks/test_export_torch_to_onnx.py

diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
index 309c37b06..37f95a1ab 100644
--- a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py
@@ -23,7 +23,6 @@
 
 class ORTInferenceBenchmark(MicroBenchmark):
     """ONNXRuntime inference micro-benchmark class."""
-
     def __init__(self, name, parameters=''):
         """Constructor.
 
diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
index 386e84dc0..5e09a8f1c 100644
--- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
@@ -23,7 +23,6 @@
 
 class TensorRTInferenceBenchmark(MicroBenchmarkWithInvoke):
     """TensorRT inference micro-benchmark class."""
-
     def __init__(self, name, parameters=''):
         """Constructor.
 
diff --git a/tests/benchmarks/micro_benchmarks/test_export_torch_to_onnx.py b/tests/benchmarks/micro_benchmarks/test_export_torch_to_onnx.py
new file mode 100644
index 000000000..a5ac5b0ce
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_export_torch_to_onnx.py
@@ -0,0 +1,450 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Unit tests for ``torch2onnxExporter`` HuggingFace export helpers.
+
+Covers:
+- ``export_huggingface_model`` (orchestration, error path, vision/NLP routing).
+- ``_build_vision_export_inputs`` (config-driven C/H/W, VisionModelWrapper).
+- ``_build_nlp_export_inputs`` (input_ids + attention_mask, NLPModelWrapper).
+- ``_build_onnx_export_kwargs`` (opset/dynamic_axes; external-data branch).
+
+Tests are pure-CPU and pure-unit: ``torch.onnx.export`` is patched out so we
+never touch the ONNX runtime, and dummy ``torch.nn.Module`` instances stand in
+for HuggingFace models.
+"""
+
+import inspect
+from pathlib import Path
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+import pytest
+import torch
+
+from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter
+
+_EXPORTER_MODULE = 'superbench.benchmarks.micro_benchmarks._export_torch_to_onnx'
+
+# ---------------------------------------------------------------------------
+# Fixtures / helpers
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def exporter(tmp_path, monkeypatch):
+    """Build a torch2onnxExporter rooted at a tmp dir to avoid touching the real torch hub."""
+    monkeypatch.setattr(torch.hub, 'get_dir', lambda: str(tmp_path))
+    return torch2onnxExporter()
+
+
+class _TinyVisionModel(torch.nn.Module):
+    """Minimal stand-in for a HuggingFace vision model.
+
+    Mimics enough of the HF API for the export helpers: ``main_input_name``,
+    a ``config`` namespace, and a ``forward`` that accepts ``pixel_values`` and
+    returns an object with a ``logits`` attribute.
+    """
+
+    main_input_name = 'pixel_values'
+
+    def __init__(self, num_channels=3, image_size=224, num_classes=4):
+        super().__init__()
+        self.config = SimpleNamespace(
+            num_channels=num_channels,
+            image_size=image_size,
+            use_cache=True,
+        )
+        # A trivial trainable parameter so .parameters() / .element_size() are exercised.
+        self.linear = torch.nn.Linear(num_channels, num_classes)
+
+    def forward(self, pixel_values):
+        # Reduce H/W and project channel dim, mimicking a classifier head.
+        flat = pixel_values.mean(dim=(2, 3))
+        return SimpleNamespace(logits=self.linear(flat))
+
+
+class _TinyNLPModel(torch.nn.Module):
+    """Minimal stand-in for a HuggingFace NLP model with input_ids + attention_mask."""
+
+    main_input_name = 'input_ids'
+
+    def __init__(self, vocab_size=128, hidden=8):
+        super().__init__()
+        self.config = SimpleNamespace(use_cache=True)
+        self.embed = torch.nn.Embedding(vocab_size, hidden)
+
+    def forward(self, input_ids, attention_mask):
+        h = self.embed(input_ids)
+        # last_hidden_state path is exercised here.
+        return SimpleNamespace(last_hidden_state=h * attention_mask.unsqueeze(-1).to(h.dtype))
+
+
+# ---------------------------------------------------------------------------
+# _build_vision_export_inputs
+# ---------------------------------------------------------------------------
+
+
+def test_build_vision_export_inputs_default_shape(exporter):
+    """Default config (3 channels, 224 image_size) → (B, 3, 224, 224) tensor."""
+    model = _TinyVisionModel(num_channels=3, image_size=224)
+
+    wrapped, args, names, axes = exporter._build_vision_export_inputs(
+        model, batch_size=2, model_dtype=torch.float32, device='cpu'
+    )
+
+    assert names == ['pixel_values']
+    assert axes == {'pixel_values': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
+    assert len(args) == 1
+    pixel_values = args[0]
+    assert tuple(pixel_values.shape) == (2, 3, 224, 224)
+    assert pixel_values.dtype == torch.float32
+    assert pixel_values.device.type == 'cpu'
+    # Wrapper is callable and returns the inner model's logits tensor (not a SimpleNamespace).
+    out = wrapped(pixel_values)
+    assert isinstance(out, torch.Tensor)
+
+
+def test_build_vision_export_inputs_custom_channels_and_size(exporter):
+    """Non-default num_channels / scalar image_size are honored."""
+    model = _TinyVisionModel(num_channels=1, image_size=384)
+
+    _, args, _, _ = exporter._build_vision_export_inputs(model, batch_size=4, model_dtype=torch.float32, device='cpu')
+    pixel_values = args[0]
+    assert tuple(pixel_values.shape) == (4, 1, 384, 384)
+
+
+def test_build_vision_export_inputs_tuple_image_size(exporter):
+    """Tuple/list ``image_size`` is unpacked as (H, W)."""
+    model = _TinyVisionModel(num_channels=3)
+    model.config.image_size = (192, 384)
+
+    _, args, _, _ = exporter._build_vision_export_inputs(model, batch_size=1, model_dtype=torch.float32, device='cpu')
+    assert tuple(args[0].shape) == (1, 3, 192, 384)
+
+
+def test_build_vision_export_inputs_wrapper_handles_last_hidden_state(exporter):
+    """The wrapper falls back to ``last_hidden_state`` when ``logits`` is absent."""
+    hidden = torch.zeros(2, 4)
+
+    class _ModelOnlyHidden(torch.nn.Module):
+        """Vision stub whose forward output exposes only ``last_hidden_state``."""
+
+        main_input_name = 'pixel_values'
+
+        def __init__(self):
+            super().__init__()
+            self.config = SimpleNamespace(num_channels=3, image_size=8)
+            # Registered layer, never called by ``forward``.
+            self.dummy = torch.nn.Linear(1, 1)
+
+        def forward(self, pixel_values):
+            # No ``logits`` attribute on purpose; the input is ignored.
+            return SimpleNamespace(last_hidden_state=hidden)
+
+    wrapped, args, _, _ = exporter._build_vision_export_inputs(
+        _ModelOnlyHidden(), batch_size=2, model_dtype=torch.float32, device='cpu'
+    )
+    out = wrapped(args[0])
+    assert torch.equal(out, hidden)
+
+
+def test_build_vision_export_inputs_wrapper_handles_tuple_output(exporter):
+    """The wrapper returns ``outputs[0]`` when the model emits a tuple."""
+
+    class _TupleModel(torch.nn.Module):
+        main_input_name = 'pixel_values'
+
+        def __init__(self):
+            super().__init__()
+            self.config = SimpleNamespace(num_channels=3, image_size=8)
+            self.dummy = torch.nn.Linear(1, 1)
+
+        def forward(self, pixel_values):
+            return (torch.ones(pixel_values.size(0), 2), torch.zeros(1))
+
+    wrapped, args, _, _ = exporter._build_vision_export_inputs(
+        _TupleModel(), batch_size=3, model_dtype=torch.float32, device='cpu'
+    )
+    out = wrapped(args[0])
+    assert tuple(out.shape) == (3, 2)
+
+
+# ---------------------------------------------------------------------------
+# _build_nlp_export_inputs
+# ---------------------------------------------------------------------------
+
+
+def test_build_nlp_export_inputs_basic(exporter):
+    """NLP path emits int64 ``input_ids`` + ``attention_mask`` of shape (B, S)."""
+    model = _TinyNLPModel()
+
+    wrapped, args, names, axes = exporter._build_nlp_export_inputs(model, batch_size=2, seq_length=16, device='cpu')
+
+    assert names == ['input_ids', 'attention_mask']
+    # Dynamic axes: batch_size + seq_length on both inputs and the output.
+    assert axes['input_ids'] == {0: 'batch_size', 1: 'seq_length'}
+    assert axes['attention_mask'] == {0: 'batch_size', 1: 'seq_length'}
+    assert axes['output'] == {0: 'batch_size', 1: 'seq_length'}
+    assert len(args) == 2
+    input_ids, attention_mask = args
+    assert tuple(input_ids.shape) == (2, 16)
+    assert tuple(attention_mask.shape) == (2, 16)
+    assert input_ids.dtype == torch.int64
+    assert attention_mask.dtype == torch.int64
+    # All ones ⇒ token id 1 is within the embedding's vocab.
+    assert torch.all(input_ids == 1)
+    # Wrapper runs the inner model and unwraps last_hidden_state.
+    out = wrapped(input_ids, attention_mask)
+    assert isinstance(out, torch.Tensor)
+    assert tuple(out.shape) == (2, 16, 8)
+
+
+def test_build_nlp_export_inputs_wrapper_handles_logits(exporter):
+    """When the inner model exposes ``logits``, the wrapper returns those."""
+
+    class _LogitsModel(torch.nn.Module):
+        main_input_name = 'input_ids'
+
+        def __init__(self):
+            super().__init__()
+            self.config = SimpleNamespace()
+            self.embed = torch.nn.Embedding(8, 4)
+
+        def forward(self, input_ids, attention_mask):
+            return SimpleNamespace(logits=self.embed(input_ids))
+
+    wrapped, args, _, _ = exporter._build_nlp_export_inputs(_LogitsModel(), batch_size=1, seq_length=4, device='cpu')
+    out = wrapped(*args)
+    assert isinstance(out, torch.Tensor)
+    assert tuple(out.shape) == (1, 4, 4)
+
+
+def test_build_nlp_export_inputs_wrapper_handles_tuple(exporter):
+    """The wrapper returns ``outputs[0]`` when the model emits a tuple."""
+
+    class _TupleNLP(torch.nn.Module):
+        main_input_name = 'input_ids'
+
+        def __init__(self):
+            super().__init__()
+            self.config = SimpleNamespace()
+            self.embed = torch.nn.Embedding(8, 4)
+
+        def forward(self, input_ids, attention_mask):
+            return (self.embed(input_ids), torch.zeros(1))
+
+    wrapped, args, _, _ = exporter._build_nlp_export_inputs(_TupleNLP(), batch_size=1, seq_length=2, device='cpu')
+    out = wrapped(*args)
+    assert tuple(out.shape) == (1, 2, 4)
+
+
+# ---------------------------------------------------------------------------
+# _build_onnx_export_kwargs
+# ---------------------------------------------------------------------------
+
+
+def test_build_onnx_export_kwargs_small_model_no_external_data(exporter):
+    """Small models (< 2GB) do not request external-data format."""
+    model = _TinyNLPModel()
+    input_names = ['input_ids', 'attention_mask']
+    dynamic_axes = {'input_ids': {0: 'b'}}
+
+    kwargs = exporter._build_onnx_export_kwargs(model, input_names, dynamic_axes)
+
+    assert kwargs['opset_version'] == 14
+    assert kwargs['do_constant_folding'] is True
+    assert kwargs['input_names'] == input_names
+    assert kwargs['output_names'] == ['output']
+    assert kwargs['dynamic_axes'] is dynamic_axes
+    assert 'external_data' not in kwargs
+    assert 'use_external_data_format' not in kwargs
+
+
+def test_build_onnx_export_kwargs_large_model_uses_external_data_modern(exporter):
+    """For >2GB models on PyTorch with ``external_data`` param, that key is used."""
+    fake_model = MagicMock()
+    # 3 GB worth of fp32 params = 3 * (1024**3) / 4 numel.
+    big_param = SimpleNamespace(
+        numel=lambda: int(3 * (1024**3) / 4),
+        element_size=lambda: 4,
+    )
+    fake_model.parameters.return_value = [big_param]
+
+    fake_sig = inspect.Signature(
+        parameters=[inspect.Parameter('external_data', inspect.Parameter.POSITIONAL_OR_KEYWORD)]
+    )
+    with patch(f'{_EXPORTER_MODULE}.inspect.signature', return_value=fake_sig):
+        kwargs = exporter._build_onnx_export_kwargs(fake_model, ['input_ids'], {})
+
+    assert kwargs['external_data'] is True
+    assert 'use_external_data_format' not in kwargs
+
+
+def test_build_onnx_export_kwargs_large_model_uses_external_data_legacy(exporter):
+    """For >2GB models on older PyTorch, ``use_external_data_format`` is used instead."""
+    fake_model = MagicMock()
+    big_param = SimpleNamespace(
+        numel=lambda: int(3 * (1024**3) / 4),
+        element_size=lambda: 4,
+    )
+    fake_model.parameters.return_value = [big_param]
+
+    fake_sig = inspect.Signature(
+        parameters=[inspect.Parameter('use_external_data_format', inspect.Parameter.POSITIONAL_OR_KEYWORD)]
+    )
+    with patch(f'{_EXPORTER_MODULE}.inspect.signature', return_value=fake_sig):
+        kwargs = exporter._build_onnx_export_kwargs(fake_model, ['input_ids'], {})
+
+    assert kwargs['use_external_data_format'] is True
+    assert 'external_data' not in kwargs
+
+
+# ---------------------------------------------------------------------------
+# export_huggingface_model
+# ---------------------------------------------------------------------------
+
+
+def test_export_huggingface_model_vision_routes_to_vision_helper(exporter, tmp_path):
+    """Vision model (main_input_name='pixel_values') uses the vision helper."""
+    model = _TinyVisionModel(num_channels=3, image_size=32)
+
+    captured = {}
+
+    def fake_export(wrapped_model, args, file_name, **kwargs):
+        captured['wrapped_model'] = wrapped_model
+        captured['args'] = args
+        captured['file_name'] = file_name
+        captured['kwargs'] = kwargs
+        Path(file_name).touch()
+
+    with patch(f'{_EXPORTER_MODULE}.torch.onnx.export', side_effect=fake_export):
+        result = exporter.export_huggingface_model(
+            model=model,
+            model_name='vit-tiny',
+            batch_size=2,
+            seq_length=16,
+            output_dir=str(tmp_path),
+        )
+
+    assert result == str(tmp_path / 'vit-tiny.onnx')
+    # Vision args: a 1-tuple holding a single (B, C, H, W) tensor.
+    assert len(captured['args']) == 1
+    assert tuple(captured['args'][0].shape) == (2, 3, 32, 32)
+    assert captured['kwargs']['input_names'] == ['pixel_values']
+    assert captured['kwargs']['opset_version'] == 14
+    # use_cache disabled to avoid DynamicCache issues.
+    assert model.config.use_cache is False
+
+
+def test_export_huggingface_model_nlp_routes_to_nlp_helper(exporter, tmp_path):
+    """NLP model (main_input_name='input_ids') uses the NLP helper."""
+    model = _TinyNLPModel()
+
+    captured = {}
+
+    def fake_export(wrapped_model, args, file_name, **kwargs):
+        captured['args'] = args
+        captured['file_name'] = file_name
+        captured['kwargs'] = kwargs
+        Path(file_name).touch()
+
+    with patch(f'{_EXPORTER_MODULE}.torch.onnx.export', side_effect=fake_export):
+        result = exporter.export_huggingface_model(
+            model=model,
+            model_name='bert-tiny',
+            batch_size=2,
+            seq_length=8,
+            output_dir=str(tmp_path),
+        )
+
+    assert result == str(tmp_path / 'bert-tiny.onnx')
+    assert len(captured['args']) == 2
+    input_ids, attention_mask = captured['args']
+    assert tuple(input_ids.shape) == (2, 8)
+    assert tuple(attention_mask.shape) == (2, 8)
+    assert captured['kwargs']['input_names'] == ['input_ids', 'attention_mask']
+
+
+def test_export_huggingface_model_default_output_dir(exporter):
+    """When ``output_dir`` is None, the exporter writes under self._onnx_model_path."""
+    model = _TinyNLPModel()
+
+    written = {}
+
+    def fake_export(wrapped_model, args, file_name, **kwargs):
+        written['file_name'] = file_name
+        Path(file_name).touch()
+
+    with patch(f'{_EXPORTER_MODULE}.torch.onnx.export', side_effect=fake_export):
+        result = exporter.export_huggingface_model(model=model, model_name='bert-default')
+
+    expected = str(exporter._onnx_model_path / 'bert-default.onnx')
+    assert result == expected
+    assert written['file_name'] == expected
+
+
+def test_export_huggingface_model_handles_export_failure(exporter, tmp_path):
+    """If ``torch.onnx.export`` raises, the helper catches the error and returns '' (logging is not asserted)."""
+    model = _TinyNLPModel()
+
+    with patch(f'{_EXPORTER_MODULE}.torch.onnx.export', side_effect=RuntimeError('boom')):
+        result = exporter.export_huggingface_model(
+            model=model,
+            model_name='bert-fail',
+            batch_size=1,
+            seq_length=4,
+            output_dir=str(tmp_path),
+        )
+
+    assert result == ''
+
+
+def test_export_huggingface_model_disables_use_cache(exporter, tmp_path):
+    """``model.config.use_cache`` is forced to False before export."""
+    model = _TinyNLPModel()
+    model.config.use_cache = True
+
+    with patch(f'{_EXPORTER_MODULE}.torch.onnx.export') as mock_export:
+        mock_export.side_effect = lambda *a, **kw: Path(a[2]).touch()
+        exporter.export_huggingface_model(
+            model=model,
+            model_name='bert-cache',
+            batch_size=1,
+            seq_length=4,
+            output_dir=str(tmp_path),
+        )
+
+    assert model.config.use_cache is False
+
+
+def test_export_huggingface_model_default_main_input_name_is_nlp(exporter, tmp_path):
+    """Models without ``main_input_name`` default to the NLP path."""
+
+    class _NoMainInput(torch.nn.Module):
+        # Intentionally no main_input_name attribute.
+        def __init__(self):
+            super().__init__()
+            self.config = SimpleNamespace(use_cache=False)
+            self.embed = torch.nn.Embedding(8, 4)
+
+        def forward(self, input_ids, attention_mask):
+            return SimpleNamespace(last_hidden_state=self.embed(input_ids))
+
+    captured = {}
+
+    def fake_export(wrapped_model, args, file_name, **kwargs):
+        captured['kwargs'] = kwargs
+        Path(file_name).touch()
+
+    with patch(f'{_EXPORTER_MODULE}.torch.onnx.export', side_effect=fake_export):
+        result = exporter.export_huggingface_model(
+            model=_NoMainInput(),
+            model_name='no-main',
+            batch_size=1,
+            seq_length=4,
+            output_dir=str(tmp_path),
+        )
+
+    assert result == str(tmp_path / 'no-main.onnx')
+    assert captured['kwargs']['input_names'] == ['input_ids', 'attention_mask']
diff --git a/tests/benchmarks/micro_benchmarks/test_ort_inference_performance.py b/tests/benchmarks/micro_benchmarks/test_ort_inference_performance.py
index 9d9d1b0db..1c20e9b11 100644
--- a/tests/benchmarks/micro_benchmarks/test_ort_inference_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_ort_inference_performance.py
@@ -5,14 +5,18 @@
 
 import shutil
 from pathlib import Path
+from types import SimpleNamespace
 from unittest import mock
+from unittest.mock import MagicMock, patch
 
+import pytest
 import torch
 import torchvision.models
 
 from tests.helper import decorator
 from superbench.benchmarks import BenchmarkRegistry, Platform, Precision, BenchmarkType, ReturnCode
 from superbench.benchmarks.micro_benchmarks.ort_inference_performance import ORTInferenceBenchmark
+from superbench.benchmarks.result import BenchmarkResult
 
 
 @decorator.cuda_test
@@ -66,3 +70,305 @@ def test_ort_inference_performance(mock_ort_session_run, mock_get_dir):
         metric = '{}_{}_time'.format(precision, model)
         assert (metric in benchmark.result)
         assert (metric in benchmark.raw_data)
+
+
+# ---------------------------------------------------------------------------
+# HuggingFace-path coverage for _preprocess_huggingface_models and
+# _export_hf_model_to_onnx. These tests are pure unit tests with no CUDA / no
+# HF network access; the model loader, ModelSourceConfig, and torch2onnxExporter
+# are all mocked to keep the suite fast and deterministic.
+# ---------------------------------------------------------------------------
+
+_ORT_MODULE = 'superbench.benchmarks.micro_benchmarks.ort_inference_performance'
+
+
+def _make_ort_benchmark(**arg_overrides):
+    """Build an ORTInferenceBenchmark and minimally initialise its mutable state.
+
+    Returns the benchmark with ``_args``, ``_result``, and the name-mangled
+    cache-path attribute populated so HF-path methods can be exercised in
+    isolation without going through the full ``_preprocess`` pipeline.
+    """
+    benchmark = ORTInferenceBenchmark('ort-inference', parameters='')
+    benchmark._result = BenchmarkResult('ort-inference', BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
+    defaults = dict(
+        model_source='huggingface',
+        model_identifier='prajjwal1/bert-tiny',
+        allow_remote_code=False,
+        precision=Precision.FLOAT16,
+        batch_size=8,
+        seq_length=128,
+        graph_opt_level=3,
+        num_warmup=1,
+        num_steps=1,
+        pytorch_models=[],
+        require_cuda=False,
+        log_raw_data=False,
+    )
+    defaults.update(arg_overrides)
+    benchmark._args = SimpleNamespace(**defaults)
+    # The HF helpers reference the name-mangled cache path; set it explicitly so
+    # we don't depend on torch.hub.get_dir() in unit tests.
+    benchmark._ORTInferenceBenchmark__model_cache_path = Path('/tmp/sb-ort-test-cache')
+    return benchmark
+
+
+# ---------------------------------------------------------------------------
+# _preprocess_huggingface_models
+# ---------------------------------------------------------------------------
+
+
+def test_preprocess_hf_missing_model_identifier():
+    """Missing --model_identifier is rejected before any HF I/O."""
+    benchmark = _make_ort_benchmark(model_identifier=None)
+
+    assert benchmark._preprocess_huggingface_models() is False
+    assert benchmark.return_code == ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE
+
+
+def test_preprocess_hf_invalid_identifier():
+    """Path-like / unsafe identifier is rejected by validate_model_identifier."""
+    benchmark = _make_ort_benchmark(model_identifier='../etc/passwd')
+
+    assert benchmark._preprocess_huggingface_models() is False
+    assert benchmark.return_code == ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE
+
+
+def test_preprocess_hf_memory_check_fails():
+    """A failing check_memory_fits result short-circuits with MICROBENCHMARK_EXECUTION_FAILURE."""
+    benchmark = _make_ort_benchmark()
+
+    with patch('transformers.AutoConfig') as mock_auto_config, \
+            patch(f'{_ORT_MODULE}.HuggingFaceModelLoader') as mock_loader_cls:
+        mock_auto_config.from_pretrained.return_value = MagicMock(name='hf_config')
+        mock_loader_cls.check_memory_fits.return_value = (False, 1000.0, 30.0, 16.0)
+
+        assert benchmark._preprocess_huggingface_models() is False
+
+    assert benchmark.return_code == ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE
+    mock_auto_config.from_pretrained.assert_called_once()
+
+
+def test_preprocess_hf_auto_config_exception():
+    """An exception while downloading the config is converted to failure."""
+    benchmark = _make_ort_benchmark()
+
+    with patch('transformers.AutoConfig') as mock_auto_config:
+        mock_auto_config.from_pretrained.side_effect = RuntimeError('boom')
+
+        assert benchmark._preprocess_huggingface_models() is False
+
+    assert benchmark.return_code == ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE
+
+
+def test_preprocess_hf_happy_path_delegates_to_export():
+    """Happy path: config + memory check pass and the export helper runs."""
+    benchmark = _make_ort_benchmark()
+
+    fake_hf_config = MagicMock(name='hf_config')
+    with patch('transformers.AutoConfig') as mock_auto_config, \
+            patch(f'{_ORT_MODULE}.HuggingFaceModelLoader') as mock_loader_cls, \
+            patch.object(benchmark, '_export_hf_model_to_onnx', return_value=True) as mock_export:
+        mock_auto_config.from_pretrained.return_value = fake_hf_config
+        mock_loader_cls.check_memory_fits.return_value = (True, 4.0, 0.02, 16.0)
+
+        assert benchmark._preprocess_huggingface_models() is True
+
+    # AutoConfig is called with trust_remote_code matching --allow_remote_code (False).
+    config_kwargs = mock_auto_config.from_pretrained.call_args.kwargs
+    assert config_kwargs['trust_remote_code'] is False
+    # _hf_config is stashed for __inference() to read vocab_size later.
+    assert benchmark._hf_config is fake_hf_config
+    # Memory check uses the runtime precision (float16 here).
+    mem_args, mem_kwargs = mock_loader_cls.check_memory_fits.call_args
+    assert mem_args[2] == 'float16'
+    assert mem_kwargs.get('mode') == 'inference'
+    # Export helper receives the pre-downloaded config to avoid a redundant fetch.
+    export_args, _ = mock_export.call_args
+    assert export_args[2] is fake_hf_config
+
+
+def test_preprocess_hf_int8_uses_float32_for_memory_check():
+    """INT8 precision still does the memory check against float32 weights."""
+    benchmark = _make_ort_benchmark(precision=Precision.INT8)
+
+    with patch('transformers.AutoConfig') as mock_auto_config, \
+            patch(f'{_ORT_MODULE}.HuggingFaceModelLoader') as mock_loader_cls, \
+            patch.object(benchmark, '_export_hf_model_to_onnx', return_value=True):
+        mock_auto_config.from_pretrained.return_value = MagicMock()
+        mock_loader_cls.check_memory_fits.return_value = (True, 1.0, 0.01, 16.0)
+
+        assert benchmark._preprocess_huggingface_models() is True
+
+    mem_args, _ = mock_loader_cls.check_memory_fits.call_args
+    assert mem_args[2] == 'float32'
+
+
+def test_preprocess_hf_allow_remote_code_propagates():
+    """--allow_remote_code is forwarded as trust_remote_code=True to AutoConfig."""
+    benchmark = _make_ort_benchmark(allow_remote_code=True)
+
+    with patch('transformers.AutoConfig') as mock_auto_config, \
+            patch(f'{_ORT_MODULE}.HuggingFaceModelLoader') as mock_loader_cls, \
+            patch.object(benchmark, '_export_hf_model_to_onnx', return_value=True):
+        mock_auto_config.from_pretrained.return_value = MagicMock()
+        mock_loader_cls.check_memory_fits.return_value = (True, 1.0, 0.01, 16.0)
+
+        benchmark._preprocess_huggingface_models()
+
+    assert mock_auto_config.from_pretrained.call_args.kwargs['trust_remote_code'] is True
+
+
+# ---------------------------------------------------------------------------
+# _export_hf_model_to_onnx
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def mock_export_dependencies(tmp_path):
+    """Patch the loader, ModelSourceConfig, exporter, and torch.cuda for export tests.
+
+    Yields a SimpleNamespace bundle of mock handles plus ``rank_dir`` (the
+    ``tmp_path / 'checkpoints'`` directory), so each test can assert on whichever it needs.
+    """
+    rank_dir = tmp_path / 'checkpoints'
+
+    with patch(f'{_ORT_MODULE}.HuggingFaceModelLoader') as loader_cls, \
+            patch(f'{_ORT_MODULE}.ModelSourceConfig') as msc, \
+            patch(f'{_ORT_MODULE}.torch.cuda') as torch_cuda:
+        loader = MagicMock()
+        loader_cls.return_value = loader
+        loader.load_model_from_config.return_value = (MagicMock(name='hf_model'), MagicMock(), None)
+        torch_cuda.is_available.return_value = False
+
+        # Patch the exporter in its defining module; it is imported lazily inside _export_hf_model_to_onnx.
+        with patch('superbench.benchmarks.micro_benchmarks._export_torch_to_onnx.torch2onnxExporter') as exporter_cls:
+            exporter = MagicMock()
+            exporter_cls.return_value = exporter
+
+            def _fake_export(model, model_name, batch_size, seq_length, output_dir):
+                """Simulate a successful ONNX export by writing the file the exporter would produce."""
+                out = Path(output_dir) / f'{model_name}.onnx'
+                out.parent.mkdir(parents=True, exist_ok=True)
+                out.touch()
+                return str(out)
+
+            exporter.export_huggingface_model.side_effect = _fake_export
+
+            yield SimpleNamespace(
+                loader_cls=loader_cls,
+                loader=loader,
+                msc=msc,
+                exporter_cls=exporter_cls,
+                exporter=exporter,
+                rank_dir=rank_dir,
+            )
+
+
+def test_export_hf_model_to_onnx_fp16_success(mock_export_dependencies, tmp_path):
+    """fp16 path: ModelSourceConfig dtype=float16, exporter writes ONNX, no quantization."""
+    benchmark = _make_ort_benchmark(precision=Precision.FLOAT16)
+    benchmark._ORTInferenceBenchmark__model_cache_path = tmp_path / 'checkpoints'
+
+    with patch.dict('os.environ', {'CUDA_VISIBLE_DEVICES': '0'}, clear=False):
+        ok = benchmark._export_hf_model_to_onnx(hf_token='abc', allow_remote_code=False, hf_config=MagicMock())
+
+    assert ok is True
+    # ModelSourceConfig built with float16 (precision dtype) and device_map=None.
+    msc_kwargs = mock_export_dependencies.msc.call_args.kwargs
+    assert msc_kwargs['torch_dtype'] == 'float16'
+    assert msc_kwargs['device_map'] is None
+    assert msc_kwargs['hf_token'] == 'abc'
+    # load_model_from_config is invoked with the pre-downloaded config to skip a redundant fetch.
+    load_kwargs = mock_export_dependencies.loader.load_model_from_config.call_args.kwargs
+    assert load_kwargs['device'] == 'cpu'
+    assert load_kwargs['config_pretrained'] is not None
+    # Exporter receives precision-tagged model name and the rank-scoped output dir.
+    export_kwargs = mock_export_dependencies.exporter.export_huggingface_model.call_args.kwargs
+    assert export_kwargs['model_name'] == 'prajjwal1_bert-tiny.float16'
+    assert export_kwargs['output_dir'].endswith('rank_0')
+    assert export_kwargs['batch_size'] == 8
+    assert export_kwargs['seq_length'] == 128
+    # pytorch_models is rewritten to the sanitized HF id ('/' -> '_') without the precision suffix.
+    assert benchmark._args.pytorch_models == ['prajjwal1_bert-tiny']
+    # Cache path now points at the rank subdirectory.
+    assert str(benchmark._ORTInferenceBenchmark__model_cache_path).endswith('rank_0')
+
+
+def test_export_hf_model_to_onnx_int8_invokes_quantize(mock_export_dependencies, tmp_path):
+    """INT8 path: ONNX is exported as float32 first, then quantize_dynamic is called."""
+    benchmark = _make_ort_benchmark(precision=Precision.INT8)
+    benchmark._ORTInferenceBenchmark__model_cache_path = tmp_path / 'checkpoints'
+
+    fake_quantize_module = MagicMock()
+    with patch.dict('sys.modules', {'onnxruntime.quantization': fake_quantize_module}), \
+            patch.dict('os.environ', {'CUDA_VISIBLE_DEVICES': '0'}, clear=False):
+        ok = benchmark._export_hf_model_to_onnx(hf_token=None, allow_remote_code=False, hf_config=MagicMock())
+
+    assert ok is True
+    # ModelSourceConfig dtype is float32 because INT8 is generated post-export.
+    msc_kwargs = mock_export_dependencies.msc.call_args.kwargs
+    assert msc_kwargs['torch_dtype'] == 'float32'
+    # Exporter wrote the float32 ONNX, then quantize_dynamic was called with that file.
+    export_kwargs = mock_export_dependencies.exporter.export_huggingface_model.call_args.kwargs
+    assert export_kwargs['model_name'] == 'prajjwal1_bert-tiny.float32'
+    fake_quantize_module.quantize_dynamic.assert_called_once()
+    quantize_args = fake_quantize_module.quantize_dynamic.call_args.args
+    assert quantize_args[0].endswith('prajjwal1_bert-tiny.float32.onnx')
+    assert quantize_args[1].endswith('prajjwal1_bert-tiny.int8.onnx')
+
+
+def test_export_hf_model_to_onnx_export_failure(mock_export_dependencies, tmp_path):
+    """If exporter returns falsy, the helper fails without touching pytorch_models."""
+    benchmark = _make_ort_benchmark()
+    benchmark._ORTInferenceBenchmark__model_cache_path = tmp_path / 'checkpoints'
+    mock_export_dependencies.exporter.export_huggingface_model.side_effect = None
+    mock_export_dependencies.exporter.export_huggingface_model.return_value = None
+
+    with patch.dict('os.environ', {'CUDA_VISIBLE_DEVICES': '0'}, clear=False):
+        ok = benchmark._export_hf_model_to_onnx(hf_token=None, allow_remote_code=False, hf_config=MagicMock())
+
+    assert ok is False
+    assert benchmark.return_code == ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE
+    assert benchmark._args.pytorch_models == []
+
+
+def test_export_hf_model_to_onnx_uses_proc_rank_env(mock_export_dependencies, tmp_path):
+    """PROC_RANK, when set alongside CUDA_VISIBLE_DEVICES, names the rank subdirectory (rank_7 here)."""
+    benchmark = _make_ort_benchmark()
+    benchmark._ORTInferenceBenchmark__model_cache_path = tmp_path / 'checkpoints'
+
+    with patch.dict('os.environ', {'CUDA_VISIBLE_DEVICES': '0', 'PROC_RANK': '7'}, clear=False):
+        ok = benchmark._export_hf_model_to_onnx(hf_token=None, allow_remote_code=False, hf_config=MagicMock())
+
+    assert ok is True
+    export_kwargs = mock_export_dependencies.exporter.export_huggingface_model.call_args.kwargs
+    assert export_kwargs['output_dir'].endswith('rank_7')
+    assert str(benchmark._ORTInferenceBenchmark__model_cache_path).endswith('rank_7')
+
+
+def test_export_hf_model_to_onnx_passes_allow_remote_code_to_loader(mock_export_dependencies, tmp_path):
+    """allow_remote_code is forwarded to the HuggingFaceModelLoader constructor."""
+    benchmark = _make_ort_benchmark()
+    benchmark._ORTInferenceBenchmark__model_cache_path = tmp_path / 'checkpoints'
+
+    with patch.dict('os.environ', {'CUDA_VISIBLE_DEVICES': '0'}, clear=False):
+        benchmark._export_hf_model_to_onnx(hf_token=None, allow_remote_code=True, hf_config=MagicMock())
+
+    loader_kwargs = mock_export_dependencies.loader_cls.call_args.kwargs
+    assert loader_kwargs['allow_remote_code'] is True
+
+
+def test_export_hf_model_to_onnx_releases_cuda_cache(mock_export_dependencies, tmp_path):
+    """When CUDA is available, torch.cuda.empty_cache() is invoked after export."""
+    benchmark = _make_ort_benchmark()
+    benchmark._ORTInferenceBenchmark__model_cache_path = tmp_path / 'checkpoints'
+
+    with patch(f'{_ORT_MODULE}.torch.cuda') as torch_cuda, \
+            patch.dict('os.environ', {'CUDA_VISIBLE_DEVICES': '0'}, clear=False):
+        torch_cuda.is_available.return_value = True
+
+        ok = benchmark._export_hf_model_to_onnx(hf_token=None, allow_remote_code=False, hf_config=MagicMock())
+
+    assert ok is True
+    torch_cuda.empty_cache.assert_called_once()
diff --git a/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py b/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py
index 441be7af1..6af16dd41 100644
--- a/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py
@@ -27,9 +27,7 @@ def _make_onnx_input(name, dims):
     """
     return SimpleNamespace(
         name=name,
-        type=SimpleNamespace(
-            tensor_type=SimpleNamespace(shape=SimpleNamespace(dim=[_make_onnx_dim(d) for d in dims]))
-        ),
+        type=SimpleNamespace(tensor_type=SimpleNamespace(shape=SimpleNamespace(dim=[_make_onnx_dim(d) for d in dims]))),
     )
 
 
@@ -41,6 +39,7 @@ def _make_onnx_model(inputs, initializer_names=()):
 
 class TensorRTInferenceBenchmarkTestCase(BenchmarkTestCase, unittest.TestCase):
     """Class for tensorrt-inferencee benchmark test cases."""
+
     @classmethod
     def setUpClass(cls):
         """Hook method for setting up class fixture before running tests in the class."""
@@ -195,13 +194,9 @@ def _make_benchmark(self, **arg_overrides):
         workspace flag already resolved) without actually invoking trtexec or
         touching the filesystem.
         """
-        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
-            self.benchmark_name, Platform.CUDA
-        )
+        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
         benchmark = benchmark_cls(self.benchmark_name, parameters='')
-        benchmark._result = BenchmarkResult(
-            self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1
-        )
+        benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
         defaults = dict(
             model_source='huggingface',
             model_identifier='prajjwal1/bert-tiny',