From 2ca3e68398a8a0505fc2d0c2d5e02f259f058b42 Mon Sep 17 00:00:00 2001 From: Aishwarya-Tonpe Date: Wed, 29 Apr 2026 23:37:02 +0000 Subject: [PATCH 01/12] feat: Add HuggingFace Hub model support for ORT and TensorRT inference benchmarks - Add HuggingFaceModelLoader for downloading and caching models from HF Hub - Support both NLP (AutoModelForCausalLM) and vision (AutoModelForImageClassification) models - Add model_source and model_identifier parameters to TensorRT/ORT benchmarks - Add ONNX export pipeline for HuggingFace models with dynamic axes - Derive vision input shapes from ONNX graph dims with HF config fallback - Filter ONNX initializers from graph.input for correct NLP input handling - Add PyTorch 2.8+ compatibility (external_data vs use_external_data_format) - Add example script, unit tests, and config schema updates - Support HF_TOKEN env var for gated model access --- .../benchmarks/ort_inference_performance.py | 73 ++- .../tensorrt_inference_performance.py | 80 +++- .../micro_benchmarks/_export_torch_to_onnx.py | 168 ++++++- .../huggingface_model_loader.py | 429 ++++++++++++++++++ .../micro_benchmarks/model_source_config.py | 89 ++++ .../ort_inference_performance.py | 164 ++++++- .../tensorrt_inference_performance.py | 185 +++++++- .../micro_benchmarks/test_huggingface_e2e.py | 103 +++++ .../test_huggingface_loader.py | 117 +++++ .../test_model_source_config.py | 73 +++ tests/helper/decorator.py | 1 + 11 files changed, 1467 insertions(+), 15 deletions(-) create mode 100644 superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py create mode 100644 superbench/benchmarks/micro_benchmarks/model_source_config.py create mode 100644 tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py create mode 100644 tests/benchmarks/micro_benchmarks/test_huggingface_loader.py create mode 100644 tests/benchmarks/micro_benchmarks/test_model_source_config.py diff --git a/examples/benchmarks/ort_inference_performance.py 
b/examples/benchmarks/ort_inference_performance.py index 18bda2043..82cd6dec0 100644 --- a/examples/benchmarks/ort_inference_performance.py +++ b/examples/benchmarks/ort_inference_performance.py @@ -4,13 +4,30 @@ """Micro benchmark example for ONNXRuntime inference performance. Commands to run: + In-house models: python3 examples/benchmarks/ort_inference_performance.py + python3 examples/benchmarks/ort_inference_performance.py --model_source in-house + + HuggingFace models: + python3 examples/benchmarks/ort_inference_performance.py \ + --model_source huggingface --model_identifier bert-base-uncased + python3 examples/benchmarks/ort_inference_performance.py \ + --model_source huggingface --model_identifier microsoft/resnet-50 + python3 examples/benchmarks/ort_inference_performance.py \ + --model_source huggingface --model_identifier deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B + +Environment variables: + HF_TOKEN: HuggingFace token for gated models (optional) """ +import argparse + from superbench.benchmarks import BenchmarkRegistry, Platform from superbench.common.utils import logger -if __name__ == '__main__': + +def run_inhouse_benchmark(): + """Run ORT inference with in-house torchvision models.""" context = BenchmarkRegistry.create_benchmark_context( 'ort-inference', platform=Platform.CUDA, parameters='--pytorch_models resnet50 resnet101 --precision float16' ) @@ -21,3 +38,57 @@ benchmark.name, benchmark.return_code, benchmark.result ) ) + return benchmark + + +def run_huggingface_benchmark(model_identifier, precision='float16', batch_size=32, seq_length=512): + """Run ORT inference with a HuggingFace model. + + Args: + model_identifier: HuggingFace model ID (e.g., 'bert-base-uncased'). + precision: Inference precision ('float32', 'float16', 'int8'). + batch_size: Batch size for inference. + seq_length: Sequence length for transformer models. 
+ """ + parameters = ( + f'--model_source huggingface ' + f'--model_identifier {model_identifier} ' + f'--precision {precision} ' + f'--batch_size {batch_size} ' + f'--seq_length {seq_length}' + ) + + logger.info(f'Running ORT inference benchmark with HuggingFace model: {model_identifier}') + + context = BenchmarkRegistry.create_benchmark_context('ort-inference', platform=Platform.CUDA, parameters=parameters) + benchmark = BenchmarkRegistry.launch_benchmark(context) + if benchmark: + logger.info( + 'benchmark: {}, return code: {}, result: {}'.format( + benchmark.name, benchmark.return_code, benchmark.result + ) + ) + return benchmark + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='ORT inference benchmark') + parser.add_argument( + '--model_source', + type=str, + default='in-house', + choices=['in-house', 'huggingface'], + help='Source of the model: in-house (default) or huggingface' + ) + parser.add_argument( + '--model_identifier', type=str, default='bert-base-uncased', help='HuggingFace model identifier' + ) + parser.add_argument('--precision', type=str, default='float16', choices=['float32', 'float16', 'int8']) + parser.add_argument('--batch_size', type=int, default=32) + parser.add_argument('--seq_length', type=int, default=512) + args = parser.parse_args() + + if args.model_source == 'huggingface': + run_huggingface_benchmark(args.model_identifier, args.precision, args.batch_size, args.seq_length) + else: + run_inhouse_benchmark() diff --git a/examples/benchmarks/tensorrt_inference_performance.py b/examples/benchmarks/tensorrt_inference_performance.py index cacbf1177..4385a728e 100644 --- a/examples/benchmarks/tensorrt_inference_performance.py +++ b/examples/benchmarks/tensorrt_inference_performance.py @@ -4,13 +4,30 @@ """Micro benchmark example for TensorRT inference performance. 
Commands to run: + In-house models: python3 examples/benchmarks/tensorrt_inference_performance.py + python3 examples/benchmarks/tensorrt_inference_performance.py --model_source in-house + + HuggingFace models: + python3 examples/benchmarks/tensorrt_inference_performance.py \ + --model_source huggingface --model_identifier bert-base-uncased + python3 examples/benchmarks/tensorrt_inference_performance.py \ + --model_source huggingface --model_identifier microsoft/resnet-50 + python3 examples/benchmarks/tensorrt_inference_performance.py \ + --model_source huggingface --model_identifier deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B + +Environment variables: + HF_TOKEN: HuggingFace token for gated models (optional) """ +import argparse + from superbench.benchmarks import BenchmarkRegistry, Platform from superbench.common.utils import logger -if __name__ == '__main__': + +def run_inhouse_benchmark(): + """Run TensorRT inference with in-house torchvision models.""" context = BenchmarkRegistry.create_benchmark_context('tensorrt-inference', platform=Platform.CUDA) benchmark = BenchmarkRegistry.launch_benchmark(context) if benchmark: @@ -19,3 +36,64 @@ benchmark.name, benchmark.return_code, benchmark.result ) ) + return benchmark + + +def run_huggingface_benchmark(model_identifier, precision='fp16', batch_size=32, seq_length=512, iterations=2048): + """Run TensorRT inference with a HuggingFace model. + + Args: + model_identifier: HuggingFace model ID (e.g., 'bert-base-uncased'). + precision: Inference precision ('fp32', 'fp16', 'int8'). + batch_size: Batch size for inference. + seq_length: Sequence length for transformer models. + iterations: Number of inference iterations. 
+ """ + parameters = ( + f'--model_source huggingface ' + f'--model_identifier {model_identifier} ' + f'--precision {precision} ' + f'--batch_size {batch_size} ' + f'--seq_length {seq_length} ' + f'--iterations {iterations}' + ) + + logger.info(f'Running TensorRT inference benchmark with HuggingFace model: {model_identifier}') + + context = BenchmarkRegistry.create_benchmark_context( + 'tensorrt-inference', platform=Platform.CUDA, parameters=parameters + ) + benchmark = BenchmarkRegistry.launch_benchmark(context) + if benchmark: + logger.info( + 'benchmark: {}, return code: {}, result: {}'.format( + benchmark.name, benchmark.return_code, benchmark.result + ) + ) + return benchmark + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='TensorRT inference benchmark') + parser.add_argument( + '--model_source', + type=str, + default='in-house', + choices=['in-house', 'huggingface'], + help='Source of the model: in-house (default) or huggingface' + ) + parser.add_argument( + '--model_identifier', type=str, default='bert-base-uncased', help='HuggingFace model identifier' + ) + parser.add_argument('--precision', type=str, default='fp16', choices=['fp32', 'fp16', 'int8']) + parser.add_argument('--batch_size', type=int, default=32) + parser.add_argument('--seq_length', type=int, default=512) + parser.add_argument('--iterations', type=int, default=2048) + args = parser.parse_args() + + if args.model_source == 'huggingface': + run_huggingface_benchmark( + args.model_identifier, args.precision, args.batch_size, args.seq_length, args.iterations + ) + else: + run_inhouse_benchmark() diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py index 876d2ccfe..ab94f74e7 100644 --- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py +++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py @@ -3,28 +3,30 @@ """Export PyTorch models to ONNX format.""" 
+import inspect from pathlib import Path from packaging import version import torch.hub import torch.onnx import torchvision.models -from transformers import BertConfig, GPT2Config, LlamaConfig -from superbench.benchmarks.model_benchmarks.pytorch_bert import BertBenchmarkModel -from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import GPT2BenchmarkModel -from superbench.benchmarks.model_benchmarks.pytorch_lstm import LSTMBenchmarkModel -from superbench.benchmarks.model_benchmarks.pytorch_llama import LlamaBenchmarkModel -from superbench.benchmarks.model_benchmarks.pytorch_mixtral import MixtralBenchmarkModel +import traceback -if MixtralBenchmarkModel is not None: - from transformers import MixtralConfig +from superbench.common.utils import logger class torch2onnxExporter(): """PyTorch model to ONNX exporter.""" def __init__(self): """Constructor.""" + from transformers import BertConfig, GPT2Config, LlamaConfig + from superbench.benchmarks.model_benchmarks.pytorch_bert import BertBenchmarkModel + from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import GPT2BenchmarkModel + from superbench.benchmarks.model_benchmarks.pytorch_lstm import LSTMBenchmarkModel + from superbench.benchmarks.model_benchmarks.pytorch_llama import LlamaBenchmarkModel + from superbench.benchmarks.model_benchmarks.pytorch_mixtral import MixtralBenchmarkModel + self.num_classes = 100 self.lstm_input_size = 256 self.benchmark_models = { @@ -129,6 +131,7 @@ def __init__(self): # Only include Mixtral models if MixtralBenchmarkModel is available if MixtralBenchmarkModel is not None: + from transformers import MixtralConfig self.benchmark_models.update( { 'mixtral-8x7b': @@ -270,3 +273,152 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512): del dummy_input torch.cuda.empty_cache() return file_name + + def export_huggingface_model(self, model, model_name, batch_size=1, seq_length=512, output_dir=None): + """Export a HuggingFace model to ONNX format. 
def export_huggingface_model(self, model, model_name, batch_size=1, seq_length=512, output_dir=None):
    """Export a HuggingFace model to ONNX format.

    Vision models (``main_input_name == 'pixel_values'``) are exported with a single
    ``pixel_values`` input; every other model is exported with ``input_ids`` and
    ``attention_mask``. The graph has one output named ``output`` (logits when the
    model returns them, otherwise the last hidden state or the first output).

    Args:
        model: HuggingFace model instance to export.
        model_name (str): Name for the exported ONNX model file (without extension).
        batch_size (int): Batch size of the dummy input. Defaults to 1.
        seq_length (int): Sequence length of the dummy input. Defaults to 512.
        output_dir (str): Output directory path. If None, uses the exporter's default path.

    Returns:
        str: Exported ONNX model file path, or empty string if export fails.
    """
    def _primary_output(outputs):
        """Select the single tensor that becomes the ONNX graph output."""
        if hasattr(outputs, 'logits'):
            return outputs.logits
        if hasattr(outputs, 'last_hidden_state'):
            return outputs.last_hidden_state
        return outputs[0] if isinstance(outputs, (tuple, list)) else outputs

    class VisionModelWrapper(torch.nn.Module):
        """Expose a vision model as ``pixel_values -> tensor`` for the exporter."""
        def __init__(self, model):
            super().__init__()
            self.model = model

        def forward(self, pixel_values):
            return _primary_output(self.model(pixel_values=pixel_values))

    class NLPModelWrapper(torch.nn.Module):
        """Expose an NLP model as ``(input_ids, attention_mask) -> tensor`` for the exporter."""
        def __init__(self, model):
            super().__init__()
            self.model = model

        def forward(self, input_ids, attention_mask):
            return _primary_output(self.model(input_ids=input_ids, attention_mask=attention_mask))

    # The dummy tensors are referenced only through this tuple so that dropping it
    # in ``finally`` really releases them before the CUDA cache is emptied.
    export_args = None
    try:
        # A caller-supplied directory may not exist yet; the exporter does not create it.
        output_path = Path(output_dir) if output_dir else self._onnx_model_path
        output_path.mkdir(parents=True, exist_ok=True)
        file_name = str(output_path / (model_name + '.onnx'))

        model.eval()

        # Disable cache to avoid DynamicCache issues with ONNX export
        if hasattr(model.config, 'use_cache'):
            model.config.use_cache = False

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        if device == 'cuda':
            model = model.cuda()

        # Floating-point dummy inputs must match the dtype of the weights.
        model_dtype = next(model.parameters()).dtype

        # HuggingFace models advertise their primary input through main_input_name.
        main_input = getattr(model, 'main_input_name', 'input_ids')
        if main_input == 'pixel_values':
            # Derive C/H/W from the model config rather than hard-coding 3x224x224.
            num_channels = getattr(model.config, 'num_channels', 3)
            image_size = getattr(model.config, 'image_size', 224)
            if isinstance(image_size, (list, tuple)):
                img_h, img_w = image_size[0], image_size[1]
            else:
                img_h = img_w = image_size

            export_args = (
                torch.randn(batch_size, num_channels, img_h, img_w, dtype=model_dtype, device=device),
            )
            input_names = ['pixel_values']
            dynamic_axes = {'pixel_values': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
            wrapped_model = VisionModelWrapper(model)
        else:
            export_args = (
                torch.ones((batch_size, seq_length), dtype=torch.int64, device=device),
                torch.ones((batch_size, seq_length), dtype=torch.int64, device=device),
            )
            input_names = ['input_ids', 'attention_mask']
            dynamic_axes = {
                name: {
                    0: 'batch_size',
                    1: 'seq_length'
                }
                for name in ('input_ids', 'attention_mask', 'output')
            }
            wrapped_model = NLPModelWrapper(model)

        # Models above 2GB need their weights stored outside the ONNX file.
        model_size_gb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024**3)
        use_external_data = model_size_gb > 2.0

        export_kwargs = {
            'opset_version': 14,
            'do_constant_folding': True,
            'input_names': input_names,
            'output_names': ['output'],
            'dynamic_axes': dynamic_axes,
        }
        if use_external_data:
            logger.info(f'Model size is {model_size_gb:.2f}GB, using external data format for ONNX export')
            # PyTorch 2.8+ renamed 'use_external_data_format' to 'external_data'
            sig = inspect.signature(torch.onnx.export)
            if 'external_data' in sig.parameters:
                export_kwargs['external_data'] = True
            else:
                export_kwargs['use_external_data_format'] = True

        torch.onnx.export(
            wrapped_model,
            export_args,
            file_name,
            **export_kwargs,
        )

        return file_name

    except Exception as e:
        # Best-effort API: callers check for an empty string instead of catching.
        logger.error(f'Failed to export HuggingFace model to ONNX: {str(e)}')
        logger.error(traceback.format_exc())
        return ''
    finally:
        # Runs on success and on failure so a failed export does not keep GPU memory.
        export_args = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
['output'], + 'dynamic_axes': dynamic_axes, + } + if use_external_data: + # PyTorch 2.8+ renamed 'use_external_data_format' to 'external_data' + sig = inspect.signature(torch.onnx.export) + if 'external_data' in sig.parameters: + export_kwargs['external_data'] = True + else: + export_kwargs['use_external_data_format'] = True + + torch.onnx.export( + wrapped_model, + export_args, + file_name, + **export_kwargs, + ) + + # Clean up + del dummy_input + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return file_name + + except Exception as e: + logger.error(f'Failed to export HuggingFace model to ONNX: {str(e)}') + logger.error(traceback.format_exc()) + return '' diff --git a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py new file mode 100644 index 000000000..9d8c55359 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py @@ -0,0 +1,429 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Hugging Face model loader for benchmarking.""" + +import os +from pathlib import Path +from typing import Optional, Tuple + +import torch +from transformers import ( + AutoModel, + AutoModelForCausalLM, + AutoConfig, + AutoTokenizer, + PreTrainedModel, + PretrainedConfig, +) + +from superbench.common.utils import logger +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig + + +class ModelLoadError(Exception): + """Exception raised when model loading fails.""" + pass + + +class ModelNotFoundError(ModelLoadError): + """Exception raised when model is not found.""" + pass + + +class ModelIncompatibleError(ModelLoadError): + """Exception raised when model is incompatible with ONNX export.""" + pass + + +class HuggingFaceModelLoader: + """Loads models from Hugging Face Hub for benchmarking. 
class HuggingFaceModelLoader:
    """Loads models from Hugging Face Hub for benchmarking.

    This class handles downloading, caching, and loading models from
    Hugging Face Hub with support for authentication and device mapping,
    and offers static helpers that estimate whether a model fits in memory
    before its weights are downloaded.

    Attributes:
        cache_dir: Directory to cache downloaded models.
        token: HuggingFace authentication token for private/gated models.
    """
    def __init__(self, cache_dir: Optional[str] = None, token: Optional[str] = None):
        """Initialize the HuggingFace model loader.

        Args:
            cache_dir: Directory to cache downloaded models. If None, uses the HF hub
                cache: ``$HF_HUB_CACHE`` when set, otherwise ``$HF_HOME/hub``
                (``~/.cache/huggingface/hub`` when ``HF_HOME`` is unset).
            token: HuggingFace authentication token for private/gated models. If None,
                falls back to ``$HF_TOKEN`` and then ``$HUGGING_FACE_HUB_TOKEN``.
        """
        # from_pretrained(cache_dir=...) expects the hub cache, which lives in the
        # "hub" sub-directory of HF_HOME. Pointing it at HF_HOME itself would ignore
        # models that other HF tooling has already downloaded.
        hf_home = os.getenv('HF_HOME') or os.path.expanduser('~/.cache/huggingface')
        self.cache_dir = cache_dir or os.getenv('HF_HUB_CACHE') or os.path.join(hf_home, 'hub')
        self.token = token or os.getenv('HF_TOKEN') or os.getenv('HUGGING_FACE_HUB_TOKEN')

        # Ensure cache directory exists
        Path(self.cache_dir).mkdir(parents=True, exist_ok=True)

        logger.info(f'HuggingFaceModelLoader initialized with cache_dir: {self.cache_dir}')
        if self.token:
            logger.info('Authentication token provided for private/gated models (token not logged)')

    def load_model(
        self,
        model_identifier: str,
        torch_dtype: Optional[str] = None,
        device: str = 'cuda',
        revision: Optional[str] = None,
        device_map: Optional[str] = None,
        config: 'Optional[PretrainedConfig]' = None,
        **kwargs
    ) -> 'Tuple[PreTrainedModel, PretrainedConfig, Optional[AutoTokenizer]]':
        """Load a model from Hugging Face Hub.

        Args:
            model_identifier: HF model ID (e.g., 'meta-llama/Llama-2-7b-hf').
            torch_dtype: Data type for model weights ('float32', 'float16', 'bfloat16').
            device: Device to load model on ('cuda', 'cpu').
            revision: Specific model version/commit/tag to use.
            device_map: Device mapping strategy for large models.
            config: Pre-downloaded model config. If None, downloads from Hub.
            **kwargs: Additional arguments passed to from_pretrained(). ``cache_dir``,
                ``token`` and ``trust_remote_code`` given here take precedence over the
                loader's own defaults (remote code is enabled by default).

        Returns:
            Tuple of (model, config, tokenizer). tokenizer is None when it cannot be loaded.

        Raises:
            ModelNotFoundError: If model doesn't exist on HF Hub.
            ModelLoadError: If model loading fails for any reason.
        """
        logger.info(f'Loading model: {model_identifier}')

        try:
            # Convert torch_dtype string to torch dtype
            dtype = self._get_torch_dtype(torch_dtype) if torch_dtype else None

            # Arguments shared by the config, tokenizer and model downloads.
            load_kwargs = {'cache_dir': self.cache_dir, 'revision': revision, **kwargs}

            # Remote code stays enabled by default. It is set inside the dict so a
            # caller-supplied value is respected and cannot collide with a second
            # keyword argument of the same name.
            load_kwargs.setdefault('trust_remote_code', True)

            # The loader token is a fallback; an explicit per-call token wins.
            if self.token and not load_kwargs.get('token'):
                load_kwargs['token'] = self.token

            if dtype is not None:
                load_kwargs['torch_dtype'] = dtype

            # Load config (use pre-downloaded config if provided)
            if config is None:
                logger.info('Loading model configuration...')
                config = AutoConfig.from_pretrained(model_identifier, **load_kwargs)
            else:
                logger.info('Using pre-downloaded model configuration.')

            # Load tokenizer (may fail for some models, e.g. vision models; that's ok)
            tokenizer = None
            try:
                logger.info('Loading tokenizer...')
                tokenizer = AutoTokenizer.from_pretrained(model_identifier, **load_kwargs)
            except Exception as e:
                logger.warning(f'Could not load tokenizer: {e}. Continuing without tokenizer.')

            # Load model
            logger.info(f'Loading model weights (dtype={torch_dtype}, device={device})...')
            model_kwargs = dict(load_kwargs)

            # A device_map passed through **kwargs counts as an explicit mapping too,
            # otherwise the model would be moved again after it has been dispatched.
            effective_device_map = device_map or model_kwargs.get('device_map')
            if not effective_device_map:
                plain_cuda = device == 'cuda' and torch.cuda.is_available()
                # Plain 'cuda' and 'cpu' are handled with .to(device) below; any other
                # value (e.g. 'auto') is forwarded as the device map.
                if not plain_cuda and device != 'cpu':
                    effective_device_map = device
            if effective_device_map:
                model_kwargs['device_map'] = effective_device_map

            # Pass the config to from_pretrained so any overrides take effect
            model_kwargs['config'] = config

            try:
                model = AutoModel.from_pretrained(model_identifier, **model_kwargs)
            except ValueError:
                logger.info('AutoModel failed, trying AutoModelForCausalLM...')
                model = AutoModelForCausalLM.from_pretrained(model_identifier, **model_kwargs)

            # Move to device if not using device_map
            if not effective_device_map and device != 'auto':
                model = model.to(device)

            logger.info(
                f'Successfully loaded model: {model_identifier} '
                f'({self._get_model_size(model):.2f}M parameters)'
            )

            return model, config, tokenizer

        except OSError as e:
            if 'not found' in str(e).lower() or '404' in str(e):
                raise ModelNotFoundError(
                    f"Model '{model_identifier}' not found on Hugging Face Hub. "
                    f'Please check the model ID at https://huggingface.co/models'
                ) from e
            raise ModelLoadError(f"Failed to load model '{model_identifier}': {e}") from e
        except Exception as e:
            raise ModelLoadError(f"Unexpected error loading model '{model_identifier}': {e}") from e

    def load_model_from_config(
        self,
        config: 'ModelSourceConfig',
        device: Optional[str] = None,
        config_pretrained: 'Optional[PretrainedConfig]' = None,
    ) -> 'Tuple[PreTrainedModel, PretrainedConfig, Optional[AutoTokenizer]]':
        """Load a model using ModelSourceConfig.

        Args:
            config: ModelSourceConfig instance with loading parameters. Its ``hf_token``
                and ``cache_dir``, when set, take precedence over the loader's own.
            device: Device to load model on. If None, uses CUDA when available, else CPU.
            config_pretrained: Pre-downloaded HF model config. If provided, skips redundant download.

        Returns:
            Tuple of (model, config, tokenizer).

        Raises:
            ValueError: If config source is not 'huggingface' or the config is invalid.
            ModelLoadError: If model loading fails.
        """
        if not config.is_huggingface():
            raise ValueError(f"Cannot load model with source '{config.source}'. Use 'huggingface' source.")

        # Validate config
        is_valid, error = config.validate()
        if not is_valid:
            raise ValueError(f'Invalid configuration: {error}')

        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Forward the per-config token and cache directory instead of dropping them.
        # Keys already present in additional_kwargs are left untouched.
        passthrough = dict(config.additional_kwargs)
        if config.cache_dir:
            passthrough.setdefault('cache_dir', config.cache_dir)
        if config.hf_token:
            passthrough.setdefault('token', config.hf_token)

        return self.load_model(
            model_identifier=config.identifier,
            torch_dtype=config.torch_dtype,
            device=device,
            revision=config.revision,
            device_map=config.device_map,
            config=config_pretrained,
            **passthrough
        )

    def _get_torch_dtype(self, dtype_str: str) -> torch.dtype:
        """Convert dtype string to torch.dtype.

        Args:
            dtype_str: String representation of dtype ('float32', 'fp16', 'bfloat16', ...).
                Matching is case-insensitive.

        Returns:
            Corresponding torch.dtype.

        Raises:
            ValueError: If dtype string is invalid or unsupported for standard HF loading.
        """
        normalized_dtype = dtype_str.strip().lower()
        if normalized_dtype == 'int8':
            raise ValueError(
                "Unsupported dtype 'int8' for Hugging Face model loading via torch_dtype. "
                'Use a dedicated quantization/loading path for int8 models or apply int8 quantization '
                'after export.'
            )
        dtype_map = {
            'float32': torch.float32,
            'float16': torch.float16,
            'bfloat16': torch.bfloat16,
            'fp32': torch.float32,
            'fp16': torch.float16,
            'bf16': torch.bfloat16,
        }

        if normalized_dtype not in dtype_map:
            raise ValueError(f"Invalid dtype '{dtype_str}'. Must be one of {list(dtype_map.keys())}")

        return dtype_map[normalized_dtype]

    def _get_model_size(self, model: 'PreTrainedModel') -> float:
        """Calculate model size in millions of parameters.

        Args:
            model: The model to measure.

        Returns:
            Number of parameters in millions.
        """
        return float(sum(p.numel() for p in model.parameters())) / 1_000_000

    @staticmethod
    def estimate_param_count_from_config(hf_config) -> Optional[int]:
        """Estimate parameter count from a HuggingFace config without instantiating the model.

        This avoids allocating tens/hundreds of GB of CPU RAM for large models (e.g. 70B).
        The estimate covers embedding + transformer layers + LM head for common architectures.
        Config fields that are absent or explicitly None fall back to their defaults.

        Args:
            hf_config: A HuggingFace PretrainedConfig object.

        Returns:
            int: Estimated number of parameters, or None if estimation is not possible.
        """
        try:

            def _dim(name, default=0):
                """Read an integer config field, treating a None value like a missing one."""
                value = getattr(hf_config, name, None)
                return default if value is None else value

            vocab = _dim('vocab_size')
            hidden = _dim('hidden_size')
            layers = _dim('num_hidden_layers')
            if vocab == 0 or hidden == 0 or layers == 0:
                return None

            intermediate = _dim('intermediate_size', hidden * 4)
            num_heads = _dim('num_attention_heads')
            # Configs without grouped-query attention leave this missing or None.
            num_kv_heads = _dim('num_key_value_heads', num_heads)
            # Some architectures set head_dim explicitly and it need not equal
            # hidden // num_heads, so prefer the configured value.
            head_dim = _dim('head_dim', hidden // num_heads if num_heads > 0 else 0)

            # Embeddings: token + (optional) learned position embeddings
            max_pos = _dim('max_position_embeddings')
            has_pos_embed = getattr(hf_config, 'position_embedding_type', None) not in ('rotary', None)
            embed_params = vocab * hidden
            if has_pos_embed and max_pos > 0:
                embed_params += max_pos * hidden

            # Per transformer layer:
            #   Self-attention: Q, K, V projections + output projection
            #   MLP: gate_proj + up_proj + down_proj (LLaMA-style) or fc1 + fc2
            #   Layer norms: 2 * hidden
            q_width = num_heads * head_dim
            kv_width = num_kv_heads * head_dim
            qkv_params = (q_width + 2 * kv_width) * hidden
            # The output projection maps the concatenated heads back to hidden.
            # Without head information, assume a square projection.
            attn_out = q_width * hidden if q_width else hidden * hidden
            # For gated MLPs (LLaMA/Mistral), there are 3 matrices; otherwise 2
            has_gate = getattr(hf_config, 'hidden_act', 'gelu') in ('silu', 'swiglu')
            mlp_params = (3 if has_gate else 2) * hidden * intermediate
            norm_params = 2 * hidden
            layer_params = qkv_params + attn_out + mlp_params + norm_params

            # MoE: if num_local_experts > 1, MLP is replicated per expert
            num_experts = _dim('num_local_experts', 1)
            if num_experts > 1:
                # Router + replicated MLP experts (attention is shared)
                router_params = hidden * num_experts
                layer_params = qkv_params + attn_out + norm_params + \
                    num_experts * mlp_params + router_params

            total_params = embed_params + layers * layer_params
            # LM head (often tied to embedding, but count it for safety)
            total_params += vocab * hidden
            # Final layer norm
            total_params += hidden

            return total_params
        except Exception as e:
            logger.warning(f'Could not estimate param count from config: {e}')
            return None

    @staticmethod
    def estimate_memory(param_count, precision_str, mode='training'):
        """Estimate memory required for a model and compare it with what is available.

        For training: weights + gradients + optimizer states (Adam uses 2x) = 4x multiplier.
        For inference: weights only + overhead for runtime buffers = ~1.2x multiplier.

        With CUDA available the comparison uses the total memory of GPU 0. Otherwise
        it uses system RAM (via psutil, or os.sysconf when psutil is not installed),
        capped at 80GB.

        Args:
            param_count (int): Number of model parameters.
            precision_str (str): Precision string ('float32', 'float16', 'bfloat16', 'fp16', 'fp32', 'int8').
            mode (str): 'training' or 'inference'.

        Returns:
            tuple: (estimated_bytes, available_bytes, fits) where fits is True if
                the model is estimated to use less than 85% of the available memory.
                When the available memory cannot be determined, available_bytes is 0
                and fits is True (check skipped).
        """
        precision_lower = precision_str.lower()
        if precision_lower in ('float16', 'fp16', 'bfloat16', 'bf16'):
            bytes_per_param = 2
        elif precision_lower in ('int8', ):
            bytes_per_param = 1
        else:
            bytes_per_param = 4

        # training: weights + gradients + 2x Adam optimizer states = 4x
        # inference: weights + runtime overhead (~20%)
        multiplier = 4 if mode == 'training' else 1.2

        estimated_bytes = int(param_count * bytes_per_param * multiplier)

        if torch.cuda.is_available():
            gpu_mem = torch.cuda.get_device_properties(0).total_memory
            # Use 85% threshold to leave headroom for activations, framework overhead, etc.
            fits = (estimated_bytes / gpu_mem) < 0.85
            return estimated_bytes, gpu_mem, fits

        # No GPU: fall back to system RAM. psutil is optional, so try the POSIX
        # sysconf values before giving up on the check.
        try:
            import psutil
            sys_mem = psutil.virtual_memory().total
        except ImportError:
            try:
                sys_mem = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
            except (AttributeError, ValueError, OSError):
                sys_mem = 0

        if sys_mem <= 0:
            logger.warning(
                'Could not determine system memory (psutil not installed and os.sysconf unavailable). '
                'Skipping memory check.'
            )
            return estimated_bytes, 0, True

        max_gpu_mem = 80 * (1024**3)    # 80GB — largest common single-GPU memory
        effective_mem = min(sys_mem, max_gpu_mem)
        fits = (estimated_bytes / effective_mem) < 0.85
        return estimated_bytes, effective_mem, fits

    @staticmethod
    def check_memory_fits(model_identifier, hf_config, precision_str, mode='training', token=None):
        """Check if a model fits in memory before downloading weights.

        Works from the already-downloaded config only, so it can be called before
        load_model() to avoid downloading large models that won't fit.

        Args:
            model_identifier (str): HF model ID (for logging).
            hf_config: A HuggingFace PretrainedConfig object.
            precision_str (str): Precision string ('float32', 'float16', etc.).
            mode (str): 'training' or 'inference'.
            token (str, optional): HF token (unused, kept for API consistency).

        Returns:
            tuple: (fits, param_count_millions, estimated_gb, available_gb)
                fits is True if model is estimated to fit. When the parameter count
                cannot be estimated the result is (True, 0, 0, 0).
        """
        param_count = HuggingFaceModelLoader.estimate_param_count_from_config(hf_config)
        if param_count is None:
            logger.warning(
                f'Could not estimate param count from config for {model_identifier}. '
                f'Proceeding with download — memory check skipped.'
            )
            return True, 0, 0, 0

        estimated_bytes, available_bytes, fits = HuggingFaceModelLoader.estimate_memory(
            param_count, precision_str, mode=mode
        )

        param_millions = param_count / 1e6
        estimated_gb = estimated_bytes / 1e9
        available_gb = available_bytes / 1e9

        if fits:
            logger.info(
                f'Model {model_identifier} ({param_millions:.1f}M params) estimated to need '
                f'~{estimated_gb:.1f}GB for {mode}, fits in available memory ({available_gb:.1f}GB).'
            )
        else:
            mem_type = 'GPU memory' if torch.cuda.is_available() else 'system RAM'
            logger.error(
                f'Model {model_identifier} ({param_millions:.1f}M params) estimated to need '
                f'~{estimated_gb:.1f}GB for {mode} (weights'
                f'{" + gradients + optimizer states" if mode == "training" else " + runtime overhead"}), '
                f'which exceeds available {mem_type} ({available_gb:.1f}GB). '
                f'Skipping benchmark. Use a smaller model variant or a machine with more memory.'
            )

        return fits, param_millions, estimated_gb, available_gb

    def __repr__(self) -> str:
        """String representation of the loader (the token itself is never included)."""
        token_status = 'authenticated' if self.token else 'no authentication'
        return f"HuggingFaceModelLoader(cache_dir='{self.cache_dir}', {token_status})"
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Configuration classes for model source and loading."""

import warnings
from dataclasses import dataclass, field
from typing import Any, Dict, Optional, Tuple

# Accepted values live at module level so validation and error messages share one definition.
_VALID_SOURCES = ('in-house', 'huggingface')
_VALID_DTYPES = ('float32', 'float16', 'bfloat16', 'int8')


@dataclass
class ModelSourceConfig:
    """Configuration for model source and loading parameters.

    This class encapsulates all configuration needed to load a model
    from either in-house definitions or Hugging Face Hub.

    Attributes:
        source: Source of the model ('in-house' or 'huggingface'), matched case-insensitively.
        identifier: Model name (in-house) or model ID (HuggingFace).
        hf_token: Optional HuggingFace authentication token for private/gated models.
        torch_dtype: Data type for model weights ('float32', 'float16', 'bfloat16', 'int8').
        revision: Specific model version/commit/tag to use.
        cache_dir: Directory to cache downloaded models.
        device_map: Device mapping strategy for model loading.
        use_auth_token: Deprecated, use hf_token instead.
        additional_kwargs: Additional keyword arguments for model loading.

    Raises:
        ValueError: If source or torch_dtype is not one of the accepted values,
            or if identifier is empty.
    """

    source: str = 'in-house'
    identifier: str = ''
    hf_token: Optional[str] = None
    torch_dtype: str = 'float32'
    revision: Optional[str] = None
    cache_dir: Optional[str] = None
    device_map: Optional[str] = None
    use_auth_token: Optional[str] = None    # Deprecated alias of hf_token.
    additional_kwargs: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Normalize the source and validate all fields right after construction."""
        # Honour the deprecated alias, but tell the caller to migrate.
        if self.use_auth_token is not None:
            warnings.warn(
                "'use_auth_token' is deprecated, use 'hf_token' instead.",
                DeprecationWarning,
                # Frame 1 is __post_init__, frame 2 the generated __init__, frame 3 the caller.
                stacklevel=3,
            )
            # An explicit hf_token always wins over the deprecated alias.
            if self.hf_token is None:
                self.hf_token = self.use_auth_token

        # Normalize and validate source. A non-string source (e.g. None) is reported
        # as a ValueError like any other invalid source, not as an AttributeError.
        if isinstance(self.source, str):
            self.source = self.source.lower()
        if self.source not in _VALID_SOURCES:
            raise ValueError(f"Invalid model source '{self.source}'. Must be 'in-house' or 'huggingface'.")

        # Validate torch_dtype
        if self.torch_dtype not in _VALID_DTYPES:
            raise ValueError(f"Invalid torch_dtype '{self.torch_dtype}'. Must be one of {list(_VALID_DTYPES)}.")

        # Only a missing/empty identifier is rejected here; a whitespace-only
        # HuggingFace identifier is reported by validate() instead of raising.
        if not self.identifier:
            raise ValueError('Model identifier must be provided.')

    def validate(self) -> Tuple[bool, str]:
        """Validate configuration parameters without raising.

        Returns:
            Tuple of (is_valid, error_message).
            If is_valid is True, error_message is empty.
        """
        # A HuggingFace identifier made only of whitespace cannot name a repository.
        if self.source == 'huggingface' and not (self.identifier and self.identifier.strip()):
            return (False, 'HuggingFace model identifier cannot be empty')

        return (True, '')

    def is_huggingface(self) -> bool:
        """Check if this configuration is for a HuggingFace model.

        Returns:
            True if source is 'huggingface', False otherwise.
        """
        return self.source == 'huggingface'

    def __repr__(self) -> str:
        """String representation of the configuration; never includes the token value."""
        token_status = 'set' if self.hf_token else 'not set'
        return (
            f"ModelSourceConfig(source='{self.source}', "
            f"identifier='{self.identifier}', "
            f"torch_dtype='{self.torch_dtype}', "
            f'hf_token={token_status})'
        )
def _preprocess_huggingface_models(self):
    """Fetch a HuggingFace model, export it to ONNX and register it for ORT inference.

    The model config is downloaded first and passed to
    HuggingFaceModelLoader.check_memory_fits, so a model reported as not fitting
    is rejected before its weights are loaded. The model is then loaded on CPU,
    exported to ONNX in a per-process directory and, for INT8, dynamically quantized.

    Returns:
        bool: True if preprocessing succeeds.
    """
    import os

    if not self._args.model_identifier:
        logger.error('--model_identifier is required when using --model_source huggingface')
        return False

    try:
        identifier = self._args.model_identifier
        logger.info(f'Loading HuggingFace model: {identifier}')

        # Step 1: config-only download, used for the memory estimate.
        from transformers import AutoConfig
        hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')
        auth_kwargs = {'token': hf_token} if hf_token else {}
        hf_config = AutoConfig.from_pretrained(identifier, trust_remote_code=True, **auth_kwargs)

        # INT8 models are loaded as float32 and quantized after the export.
        quantize = self._args.precision == Precision.INT8
        weights_dtype = 'float32' if quantize else self._args.precision.value
        fits, _, _, _ = HuggingFaceModelLoader.check_memory_fits(
            identifier, hf_config, weights_dtype, mode='inference', token=hf_token
        )
        if not fits:
            return False

        # Step 2: load the weights and export to ONNX.
        # A per-process sub-directory keeps concurrent exports of the same model apart.
        proc_rank = os.getenv('PROC_RANK', os.getenv('CUDA_VISIBLE_DEVICES', '0'))
        export_dir = self.__model_cache_path / f'rank_{proc_rank}'
        export_dir.mkdir(parents=True, exist_ok=True)

        # Load on CPU (device_map=None) so the model is not dispatched across
        # several GPUs, which causes device mismatch during ONNX export.
        source_config = ModelSourceConfig(
            source='huggingface',
            identifier=identifier,
            hf_token=hf_token,
            torch_dtype=weights_dtype,
            device_map=None,
        )
        hf_model, _, _ = HuggingFaceModelLoader().load_model_from_config(source_config, device='cpu')

        from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter

        model_name = identifier.replace('/', '_')
        # The exported file name carries the precision; INT8 is exported as float32 first.
        export_precision = Precision.FLOAT32.value if quantize else self._args.precision.value

        # Export directly to the final destination to avoid path issues with external data.
        onnx_path = torch2onnxExporter().export_huggingface_model(
            model=hf_model,
            model_name=f'{model_name}.{export_precision}',
            batch_size=self._args.batch_size,
            seq_length=self._args.seq_length,
            output_dir=str(export_dir),
        )
        if not onnx_path:
            logger.error(f'Failed to export {self._args.model_identifier} to ONNX')
            return False

        if quantize:
            from onnxruntime.quantization import quantize_dynamic
            int8_path = str(export_dir / f'{model_name}.{Precision.INT8.value}.onnx')
            quantize_dynamic(onnx_path, int8_path)
            logger.info('Applied INT8 quantization to HuggingFace model')

        # Point the benchmark loop at the exported model and its directory.
        self._args.pytorch_models = [model_name]
        self.__model_cache_path = export_dir

        logger.info('Successfully prepared HuggingFace model for ORT inference')
        return True

    except Exception as e:
        logger.error(f'Failed to prepare HuggingFace model: {str(e)}')
        import traceback
        logger.error(traceback.format_exc())
        return False
-177,15 +317,33 @@ def __inference(self, ort_sess): elapse_times (List[float]): latency of every iterations. """ precision = np.float16 if self._args.precision == Precision.FLOAT16 else np.float32 - input_tensor = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision) + + # Get input names from the ONNX session to determine input format + input_names = [input.name for input in ort_sess.get_inputs()] + + # Determine input format based on what the model expects + if 'pixel_values' in input_names: + # Vision model: use pixel_values (batch_size, 3, 224, 224) + pixel_values = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision) + inputs = {'pixel_values': pixel_values} + elif 'input_ids' in input_names: + # NLP model: use input_ids and attention_mask + seq_len = getattr(self._args, 'seq_length', 512) + input_ids = np.random.randint(0, 30000, (self._args.batch_size, seq_len)).astype(np.int64) + attention_mask = np.ones((self._args.batch_size, seq_len), dtype=np.int64) + inputs = {'input_ids': input_ids, 'attention_mask': attention_mask} + else: + # Default for in-house torchvision models: use 'input' (batch_size, 3, 224, 224) + input_tensor = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision) + inputs = {'input': input_tensor} for i in range(self._args.num_warmup): - ort_sess.run(None, {'input': input_tensor}) + ort_sess.run(None, inputs) elapse_times = list() for i in range(self._args.num_steps): start = time.time() - ort_sess.run(None, {'input': input_tensor}) + ort_sess.run(None, inputs) end = time.time() elapse_times.append((end - start) * 1000) diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py index 4d5a5b4b7..5153073a3 100644 --- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py @@ 
-3,13 +3,18 @@ """TensorRT inference micro-benchmark.""" +import os import re from pathlib import Path +import torch + from superbench.common.utils import logger from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig +from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader class TensorRTInferenceBenchmark(MicroBenchmarkWithInvoke): @@ -71,6 +76,24 @@ def add_parser_arguments(self): help='Run at least N inference iterations.', ) + # HuggingFace model arguments + self._parser.add_argument( + '--model_source', + type=str, + choices=['in-house', 'huggingface'], + default='in-house', + required=False, + help='Source of the model: in-house (default) or huggingface.', + ) + + self._parser.add_argument( + '--model_identifier', + type=str, + default=None, + required=False, + help='Model identifier for HuggingFace models (e.g., bert-base-uncased).', + ) + def _preprocess(self): """Preprocess/preparation operations before the benchmarking. 
def _preprocess_huggingface_models(self):
    """Export a HuggingFace model to ONNX and queue the TensorRT command for it.

    The model config is downloaded first and passed to
    HuggingFaceModelLoader.check_memory_fits, so a model reported as not fitting
    is rejected before its weights are loaded. The model is then loaded on CPU in
    float32, exported to ONNX, and the optimization shapes are derived from the
    runtime inputs of the exported graph.

    Returns:
        bool: True if preprocessing succeeds, False otherwise (the reason is logged).
    """
    import os
    import shlex

    model_identifier = self._args.model_identifier
    if not model_identifier:
        logger.error('--model_identifier is required when using --model_source huggingface')
        return False

    try:
        # Optional dependencies are imported inside the try block so a missing package
        # is reported as a preprocessing failure instead of escaping as an exception.
        import onnx as onnx_lib
        from transformers import AutoConfig

        # Step 1: Pre-download memory check - download only the config (a few KB)
        # and estimate whether the full model will fit in memory.
        hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')
        load_kwargs = {'token': hf_token} if hf_token else {}
        # NOTE(review): trust_remote_code=True lets the model repository execute its own
        # Python during loading; only run this with trusted model identifiers.
        hf_config = AutoConfig.from_pretrained(model_identifier, trust_remote_code=True, **load_kwargs)
        fits, _, _, _ = HuggingFaceModelLoader.check_memory_fits(
            model_identifier, hf_config, self._args.precision, mode='inference', token=hf_token
        )
        if not fits:
            return False

        # Step 2: Download and load the full model.
        # A per-process directory avoids races when several processes export the same model.
        proc_rank = os.getenv('PROC_RANK', os.getenv('CUDA_VISIBLE_DEVICES', '0'))

        # Load on CPU to avoid accelerate dispatching the model across multiple GPUs,
        # which causes device mismatch during ONNX export. TensorRT handles precision
        # via the --fp16/--int8 flags, so the ONNX model is always exported in float32.
        model_config = ModelSourceConfig(
            source='huggingface',
            identifier=model_identifier,
            hf_token=hf_token,
            torch_dtype='float32',
            device_map=None,
        )

        logger.info(f'Loading HuggingFace model: {model_identifier}')
        hf_model, hf_config, _ = HuggingFaceModelLoader().load_model_from_config(model_config, device='cpu')
        self._hf_config = hf_config

        model_name = model_identifier.replace('/', '_')
        # Read the sequence length once so the export and the shape spec always agree.
        seq_len = getattr(self._args, 'seq_length', 512)

        output_dir = str(Path(torch.hub.get_dir()) / 'checkpoints' / f'trt_rank_{proc_rank}')
        os.makedirs(output_dir, exist_ok=True)

        onnx_path = torch2onnxExporter().export_huggingface_model(
            model=hf_model,
            model_name=model_name,
            batch_size=self._args.batch_size,
            seq_length=seq_len,
            output_dir=output_dir,
        )
        if not onnx_path:
            logger.error(f'Failed to export {model_identifier} to ONNX')
            return False

        # Only graph metadata is needed here, so skip reading external weight files.
        onnx_model = onnx_lib.load(onnx_path, load_external_data=False)

        # Filter out initializers from graph.input to get only runtime inputs.
        initializer_names = {init.name for init in onnx_model.graph.initializer}
        runtime_inputs = [inp for inp in onnx_model.graph.input if inp.name not in initializer_names]
        if not runtime_inputs:
            logger.error(f'No runtime inputs found in exported ONNX model: {onnx_path}')
            return False

        input_shapes = self._get_huggingface_opt_shapes(
            runtime_inputs, self._args.batch_size, seq_len, hf_config
        )

        # The arguments are joined into a single command string, so values derived from
        # user input or from the ONNX graph are quoted (shlex.quote leaves already-safe
        # strings unchanged).
        args = [
            self.__bin_path,
            f'--onnx={shlex.quote(str(onnx_path))}',
            f'--optShapes={shlex.quote(input_shapes)}',
            '--memPoolSize=workspace:8192M',
            None if self._args.precision == 'fp32' else f'--{self._args.precision}',
            f'--iterations={self._args.iterations}',
            '--percentile=99',
        ]
        self._commands.append(' '.join(filter(None, args)))

        # Store model name for result processing.
        self._args.pytorch_models = [model_name]

        logger.info('Successfully prepared HuggingFace model for TensorRT inference')
        return True

    except Exception as e:
        logger.error(f'Failed to prepare HuggingFace model: {str(e)}')
        import traceback
        logger.error(traceback.format_exc())
        return False

@staticmethod
def _get_huggingface_opt_shapes(runtime_inputs, batch_size, seq_len, hf_config=None):
    """Build the --optShapes value from the runtime inputs of an exported ONNX graph.

    Args:
        runtime_inputs (list): Non-empty list of ONNX graph inputs (initializers excluded).
        batch_size (int): Batch size used for every input.
        seq_len (int): Sequence length used for non-vision inputs.
        hf_config: HuggingFace config used as fallback for dynamic image dimensions.

    Returns:
        str: Comma-separated 'name:shape' entries, e.g. 'input_ids:32x512,attention_mask:32x512'.
    """
    first = runtime_inputs[0]
    first_dims = first.type.tensor_type.shape.dim

    # Vision models: first input is 'pixel_values' or 4D (batch, channels, height, width).
    if first.name == 'pixel_values' or len(first_dims) == 4:

        def static_dim(index):
            """Return the fixed size of a dimension, or None when it is dynamic/missing."""
            if index < len(first_dims) and first_dims[index].dim_value > 0:
                return first_dims[index].dim_value
            return None

        # Fall back to HF config metadata (then to 3x224x224) when ONNX dims are dynamic.
        image_size = getattr(hf_config, 'image_size', None) or 224
        if isinstance(image_size, (list, tuple)):
            default_height, default_width = image_size[0], image_size[1]
        else:
            default_height = default_width = image_size
        channels = static_dim(1) or getattr(hf_config, 'num_channels', None) or 3
        height = static_dim(2) or default_height
        width = static_dim(3) or default_width
        return f'{first.name}:{batch_size}x{channels}x{height}x{width}'

    # NLP models: every input shares the same batch size and sequence length.
    shapes = []
    for inp in runtime_inputs:
        if len(inp.type.tensor_type.shape.dim) == 4:
            # 4D input (rare for NLP): batch x 1 x seq x seq.
            shapes.append(f'{inp.name}:{batch_size}x1x{seq_len}x{seq_len}')
        else:
            # Standard (and default) case: batch x sequence.
            shapes.append(f'{inp.name}:{batch_size}x{seq_len}')
    return ','.join(shapes)
+ """ + model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + + assert model is not None + assert config is not None + assert config.model_type == 'bert' + + # Verify model can do a forward pass + dummy_input = torch.randint(0, 1000, (1, 10)) + with torch.no_grad(): + output = model(dummy_input) + assert output is not None + + def test_load_distilgpt2_model(self, loader): + """Test loading DistilGPT2 model from HuggingFace Hub. + + Uses distilbert/distilgpt2 which is a small public GPT-2 model (~82MB). + """ + model, config, tokenizer = loader.load_model('distilbert/distilgpt2', device='cpu') + + assert model is not None + assert config is not None + assert config.model_type == 'gpt2' + + # Verify model can do a forward pass + dummy_input = torch.randint(0, 1000, (1, 10)) + with torch.no_grad(): + output = model(dummy_input) + assert output is not None + + def test_load_model_from_config(self, loader): + """Test loading model using ModelSourceConfig via load_model_from_config.""" + config = ModelSourceConfig(source='huggingface', identifier='prajjwal1/bert-tiny', torch_dtype='float32') + + model, hf_config, tokenizer = loader.load_model_from_config(config, device='cpu') + + assert model is not None + assert hf_config.model_type == 'bert' + + def test_load_model_with_dtype(self, loader): + """Test loading model and converting dtype after load.""" + model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + + # Convert to float32 after loading + model = model.float() + + # Check model parameters are float32 + param = next(model.parameters()) + assert param.dtype == torch.float32 + + @pytest.mark.skipif(not torch.cuda.is_available(), reason='Requires GPU') + def test_load_model_to_gpu(self, loader): + """Test loading model and moving to GPU.""" + model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + + # Move to GPU manually + model = model.cuda() + + # Check model is on GPU + param = 
next(model.parameters()) + assert param.device.type == 'cuda' + + def test_architecture_detection(self, loader): + """Test that architecture is correctly detected from loaded model.""" + model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + + # Architecture should be detected from config + assert config.model_type is not None + assert 'bert' in config.model_type.lower() diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py b/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py new file mode 100644 index 000000000..e679fb068 --- /dev/null +++ b/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py @@ -0,0 +1,117 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Unit tests for HuggingFaceModelLoader.""" + +import pytest +import torch +from unittest.mock import MagicMock, patch + +from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import ( + HuggingFaceModelLoader, + ModelNotFoundError, +) +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig + + +class TestHuggingFaceModelLoader: + """Test cases for HuggingFaceModelLoader class.""" + @pytest.fixture + def loader(self, tmp_path): + """Create a loader instance for testing.""" + return HuggingFaceModelLoader(cache_dir=str(tmp_path / 'test_cache'), token=None) + + def test_initialization(self, loader, tmp_path): + """Test loader initialization.""" + assert loader.cache_dir == str(tmp_path / 'test_cache') + assert loader.token is None + + def test_initialization_with_env_token(self, monkeypatch, tmp_path): + """Test loader picks up token from environment.""" + monkeypatch.setenv('HF_TOKEN', 'env_token') + monkeypatch.setenv('HF_HOME', str(tmp_path / 'hf_cache')) + loader = HuggingFaceModelLoader() + assert loader.token == 'env_token' + + def test_get_torch_dtype_valid(self, loader): + """Test torch dtype conversion.""" + assert loader._get_torch_dtype('float32') == 
torch.float32 + assert loader._get_torch_dtype('float16') == torch.float16 + assert loader._get_torch_dtype('fp16') == torch.float16 + assert loader._get_torch_dtype('bfloat16') == torch.bfloat16 + + def test_get_torch_dtype_invalid(self, loader): + """Test invalid dtype raises error.""" + with pytest.raises(ValueError, match='Invalid dtype'): + loader._get_torch_dtype('invalid_dtype') + + @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoModel') + @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoConfig') + @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoTokenizer') + def test_load_model_success(self, mock_tokenizer, mock_config, mock_model, loader): + """Test successful model loading.""" + # Mock config + mock_cfg = MagicMock() + mock_cfg.model_type = 'bert' + mock_config.from_pretrained.return_value = mock_cfg + + # Mock model + mock_mdl = MagicMock() + mock_mdl.parameters.return_value = [torch.randn(100, 100)] + mock_mdl.to.return_value = mock_mdl + mock_model.from_pretrained.return_value = mock_mdl + + # Mock tokenizer + mock_tok = MagicMock() + mock_tokenizer.from_pretrained.return_value = mock_tok + + model, config, tokenizer = loader.load_model('test/model', device='cpu') + + assert model == mock_mdl + assert config == mock_cfg + assert tokenizer == mock_tok + + # Verify mocks were called with correct arguments + mock_config.from_pretrained.assert_called_once() + call_kwargs = mock_config.from_pretrained.call_args + assert call_kwargs[0][0] == 'test/model' + assert call_kwargs[1]['trust_remote_code'] is True + assert call_kwargs[1]['cache_dir'] == loader.cache_dir + + mock_model.from_pretrained.assert_called_once() + model_call_kwargs = mock_model.from_pretrained.call_args + assert model_call_kwargs[1]['trust_remote_code'] is True + assert model_call_kwargs[1]['cache_dir'] == loader.cache_dir + + mock_tokenizer.from_pretrained.assert_called_once() + + # Verify model was moved to 
class TestModelSourceConfig:
    """Test cases for ModelSourceConfig class."""
    def test_default_config(self):
        """Only the identifier is given, everything else takes its default."""
        cfg = ModelSourceConfig(identifier='bert-base')
        assert (cfg.source, cfg.identifier, cfg.torch_dtype) == ('in-house', 'bert-base', 'float32')
        assert cfg.hf_token is None

    def test_huggingface_config(self):
        """Explicit HuggingFace source, identifier and dtype are stored as given."""
        cfg = ModelSourceConfig(source='huggingface', identifier='meta-llama/Llama-2-7b-hf', torch_dtype='float16')
        assert (cfg.source, cfg.identifier, cfg.torch_dtype) == (
            'huggingface', 'meta-llama/Llama-2-7b-hf', 'float16'
        )

    def test_invalid_source(self):
        """An unknown source is rejected at construction time."""
        with pytest.raises(ValueError, match='Invalid model source'):
            ModelSourceConfig(source='invalid', identifier='test')

    def test_invalid_dtype(self):
        """An unknown torch_dtype is rejected at construction time."""
        with pytest.raises(ValueError, match='Invalid torch_dtype'):
            ModelSourceConfig(identifier='test', torch_dtype='invalid')

    def test_missing_identifier(self):
        """An empty identifier is rejected at construction time."""
        with pytest.raises(ValueError, match='identifier must be provided'):
            ModelSourceConfig(identifier='')

    def test_validate_huggingface_empty(self):
        """A whitespace-only HuggingFace identifier fails validate()."""
        outcome, reason = ModelSourceConfig(source='huggingface', identifier=' ').validate()
        assert not outcome
        assert 'cannot be empty' in reason

    def test_validate_valid_huggingface(self):
        """A namespaced HuggingFace identifier passes validate()."""
        outcome, reason = ModelSourceConfig(source='huggingface', identifier='meta-llama/Llama-2-7b-hf').validate()
        assert outcome
        assert reason == ''

    def test_validate_valid_huggingface_short_name(self):
        """A HuggingFace identifier without an organization passes validate()."""
        outcome, reason = ModelSourceConfig(source='huggingface', identifier='bert-base-uncased').validate()
        assert outcome
        assert reason == ''

    def test_is_huggingface(self):
        """is_huggingface() reflects the configured source."""
        from_hub = ModelSourceConfig(source='huggingface', identifier='test/model')
        local = ModelSourceConfig(source='in-house', identifier='bert-base')
        assert from_hub.is_huggingface() is True
        assert local.is_huggingface() is False

    def test_deprecated_use_auth_token(self):
        """The deprecated use_auth_token value is copied into hf_token."""
        cfg = ModelSourceConfig(identifier='test', use_auth_token='old_token')
        assert cfg.hf_token == 'old_token'
import re
from typing import Optional

# Strict allow-list for HuggingFace model identifiers. Accepts either a bare
# repo name ('bert-base-uncased') or 'namespace/name' form, restricted to the
# character set HF itself uses and bounded in length. Rejects backslash, colon,
# control chars and absolute paths. The character class alone still admits
# '..' (e.g. 'foo/..'), so validate_model_identifier() rejects that separately.
_SAFE_MODEL_ID_RE = re.compile(r'^[A-Za-z0-9][A-Za-z0-9._-]{0,127}(/[A-Za-z0-9._-]{1,128})?$')


def validate_model_identifier(model_identifier: Optional[str]) -> str:
    """Validate a HuggingFace model identifier against a strict allow-list.

    Args:
        model_identifier: The identifier to validate (typically from CLI input).

    Returns:
        The validated identifier (unchanged) for convenient inline use.

    Raises:
        ValueError: If the identifier is missing, is not a string, contains
            '..', or does not match the permitted ``[namespace/]name`` shape.
            The check intentionally rejects path-traversal sequences and
            characters that could let ``from_pretrained`` load attacker-staged
            files from disk.
    """
    is_valid = (
        isinstance(model_identifier, str)
        # fullmatch, not match: with match() the trailing '$' also accepts an
        # identifier that ends in a newline (e.g. 'bert-base-uncased\n').
        and _SAFE_MODEL_ID_RE.fullmatch(model_identifier) is not None
        # '.' is a legal character, so '..' has to be excluded explicitly.
        and '..' not in model_identifier
    )
    if not is_valid:
        raise ValueError(
            f'Invalid model_identifier {model_identifier!r}. '
            'Must be a HuggingFace repo id matching '
            "'^[A-Za-z0-9][A-Za-z0-9._-]{0,127}(/[A-Za-z0-9._-]{1,128})?$' "
            "and must not contain '..' "
            '(e.g. "bert-base-uncased" or "meta-llama/Llama-2-7b-hf").'
        )
    return model_identifier
""" self.cache_dir = cache_dir or os.getenv('HF_HOME') or os.path.expanduser('~/.cache/huggingface') self.token = token or os.getenv('HF_TOKEN') or os.getenv('HUGGING_FACE_HUB_TOKEN') + self.allow_remote_code = bool(allow_remote_code) # Ensure cache directory exists Path(self.cache_dir).mkdir(parents=True, exist_ok=True) @@ -63,6 +110,11 @@ def __init__(self, cache_dir: Optional[str] = None, token: Optional[str] = None) logger.info(f'HuggingFaceModelLoader initialized with cache_dir: {self.cache_dir}') if self.token: logger.info('Authentication token provided for private/gated models (token not logged)') + if self.allow_remote_code: + logger.warning( + 'allow_remote_code=True: HuggingFace may download and execute arbitrary Python ' + 'from model repositories. Only enable for trusted model identifiers; pin --revision.' + ) def load_model( self, @@ -94,6 +146,9 @@ def load_model( """ logger.info(f'Loading model: {model_identifier}') + # Reject malformed / path-like identifiers before any network or disk activity. 
+ validate_model_identifier(model_identifier) + try: # Convert torch_dtype string to torch dtype dtype = self._get_torch_dtype(torch_dtype) if torch_dtype else None @@ -112,7 +167,9 @@ def load_model( # Load config (use pre-downloaded config if provided) if config is None: logger.info('Loading model configuration...') - config = AutoConfig.from_pretrained(model_identifier, trust_remote_code=True, **load_kwargs) + config = AutoConfig.from_pretrained( + model_identifier, trust_remote_code=self.allow_remote_code, **load_kwargs + ) else: logger.info('Using pre-downloaded model configuration.') @@ -120,14 +177,16 @@ def load_model( tokenizer = None try: logger.info('Loading tokenizer...') - tokenizer = AutoTokenizer.from_pretrained(model_identifier, trust_remote_code=True, **load_kwargs) + tokenizer = AutoTokenizer.from_pretrained( + model_identifier, trust_remote_code=self.allow_remote_code, **load_kwargs + ) except Exception as e: logger.warning(f'Could not load tokenizer: {e}. Continuing without tokenizer.') # Load model logger.info(f'Loading model weights (dtype={torch_dtype}, device={device})...') model_kwargs = load_kwargs.copy() - model_kwargs['trust_remote_code'] = True + model_kwargs['trust_remote_code'] = self.allow_remote_code # Handle device mapping for large models effective_device_map = device_map @@ -202,16 +261,31 @@ def load_model_from_config( if device is None: device = 'cuda' if torch.cuda.is_available() else 'cpu' - # Extract loading parameters - return self.load_model( - model_identifier=config.identifier, - torch_dtype=config.torch_dtype, - device=device, - revision=config.revision, - device_map=config.device_map, - config=config_pretrained, - **config.additional_kwargs - ) + # Honor explicit per-call hf_token / cache_dir from the config without permanently + # mutating the loader instance. This makes ModelSourceConfig the single source of + # truth for callers that don't rely on HF_TOKEN / HF_HOME env vars. 
+ original_token = self.token + original_cache_dir = self.cache_dir + try: + if config.hf_token: + self.token = config.hf_token + if config.cache_dir: + self.cache_dir = config.cache_dir + Path(self.cache_dir).mkdir(parents=True, exist_ok=True) + + # Extract loading parameters + return self.load_model( + model_identifier=config.identifier, + torch_dtype=config.torch_dtype, + device=device, + revision=config.revision, + device_map=config.device_map, + config=config_pretrained, + **config.additional_kwargs + ) + finally: + self.token = original_token + self.cache_dir = original_cache_dir def _get_torch_dtype(self, dtype_str: str) -> torch.dtype: """Convert dtype string to torch.dtype. @@ -242,7 +316,7 @@ def _get_torch_dtype(self, dtype_str: str) -> torch.dtype: } if normalized_dtype not in dtype_map: - raise ValueError(f"Invalid dtype '{dtype_str}'.Must be one of {list(dtype_map.keys())}") + raise ValueError(f"Invalid dtype '{dtype_str}'. Must be one of {list(dtype_map.keys())}") return dtype_map[normalized_dtype] diff --git a/superbench/benchmarks/micro_benchmarks/model_source_config.py b/superbench/benchmarks/micro_benchmarks/model_source_config.py index 99ca31870..b141e6a21 100644 --- a/superbench/benchmarks/micro_benchmarks/model_source_config.py +++ b/superbench/benchmarks/micro_benchmarks/model_source_config.py @@ -47,8 +47,10 @@ def __post_init__(self): if self.source not in ['in-house', 'huggingface']: raise ValueError(f"Invalid model source '{self.source}'. Must be 'in-house' or 'huggingface'.") - # Validate torch_dtype - valid_dtypes = ['float32', 'float16', 'bfloat16', 'int8'] + # Validate torch_dtype. NOTE: 'int8' is intentionally excluded here — it is handled + # post-export via quantize_dynamic (see ort_inference_performance.py) rather than via + # the HF torch_dtype loading path, which does not accept torch.int8. 
+ valid_dtypes = ['float32', 'float16', 'bfloat16'] if self.torch_dtype not in valid_dtypes: raise ValueError(f"Invalid torch_dtype '{self.torch_dtype}'. Must be one of {valid_dtypes}.") diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py index 2e0fff826..7e74e62c1 100644 --- a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py @@ -12,10 +12,13 @@ import numpy as np from superbench.common.utils import logger -from superbench.benchmarks import BenchmarkRegistry, Platform, Precision +from superbench.benchmarks import BenchmarkRegistry, Platform, Precision, ReturnCode from superbench.benchmarks.micro_benchmarks import MicroBenchmark from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig -from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader +from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import ( + HuggingFaceModelLoader, + validate_model_identifier, +) class ORTInferenceBenchmark(MicroBenchmark): @@ -42,6 +45,9 @@ def __init__(self, name, parameters=''): ] self.__graph_opt_level = None self.__model_cache_path = Path(torch.hub.get_dir()) / 'checkpoints' + # Stashed HF config (populated in _preprocess_huggingface_models) so that + # __inference() can derive vocab_size / dynamic input shapes from it. + self._hf_config = None def add_parser_arguments(self): """Add the specified arguments.""" @@ -124,6 +130,24 @@ def add_parser_arguments(self): help='Sequence length for transformer models.', ) + self._parser.add_argument( + '--require_cuda', + action='store_true', + default=False, + required=False, + help='Fail if CUDAExecutionProvider is not available. 
' + 'Default: warn and fall back to other registered ORT providers (CPU/ROCm/etc.).', + ) + + self._parser.add_argument( + '--allow_remote_code', + action='store_true', + default=False, + required=False, + help='Allow HuggingFace to execute model-repo Python (trust_remote_code=True). ' + 'SECURITY: enables RCE from --model_identifier. Pin --revision when used.', + ) + def _preprocess(self): """Preprocess/preparation operations before the benchmarking. @@ -179,8 +203,19 @@ def _preprocess_huggingface_models(self): if not self._args.model_identifier: logger.error('--model_identifier is required when using --model_source huggingface') + self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) return False + # Reject malformed / path-like identifiers up front, before any network or disk activity. + try: + validate_model_identifier(self._args.model_identifier) + except ValueError as e: + logger.error(str(e)) + self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) + return False + + allow_remote_code = bool(getattr(self._args, 'allow_remote_code', False)) + try: logger.info(f'Loading HuggingFace model: {self._args.model_identifier}') @@ -190,13 +225,18 @@ def _preprocess_huggingface_models(self): load_kwargs = {} if hf_token: load_kwargs['token'] = hf_token - hf_config = AutoConfig.from_pretrained(self._args.model_identifier, trust_remote_code=True, **load_kwargs) + hf_config = AutoConfig.from_pretrained( + self._args.model_identifier, trust_remote_code=allow_remote_code, **load_kwargs + ) + # Stash for __inference() to read vocab_size / other model metadata later. 
+ self._hf_config = hf_config precision_str = self._args.precision.value if self._args.precision != Precision.INT8 else 'float32' fits, param_m, est_gb, avail_gb = HuggingFaceModelLoader.check_memory_fits( self._args.model_identifier, hf_config, precision_str, mode='inference', token=hf_token ) if not fits: + self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) return False # Step 2: Proceed with model download and ONNX export @@ -217,7 +257,7 @@ def _preprocess_huggingface_models(self): ) # Load model from HuggingFace on CPU - loader = HuggingFaceModelLoader() + loader = HuggingFaceModelLoader(allow_remote_code=allow_remote_code) hf_model, _, _ = loader.load_model_from_config(model_config, device='cpu') from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter exporter = torch2onnxExporter() @@ -237,6 +277,15 @@ def _preprocess_huggingface_models(self): export_precision = self._args.precision.value model_name_with_precision = f'{model_name}.{export_precision}' + # Defense-in-depth: confirm the resolved output path stays inside the rank + # directory even though validate_model_identifier already rejected '..' / '\\'. 
+ proc_root = proc_output_path.resolve() + resolved_out = (proc_output_path / f'{model_name_with_precision}.onnx').resolve() + if proc_root not in resolved_out.parents: + logger.error(f'Refusing to write ONNX outside rank dir: {resolved_out} not under {proc_root}') + self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) + return False + # Export directly to final destination to avoid path issues with external data onnx_path = exporter.export_huggingface_model( model=hf_model, @@ -248,6 +297,7 @@ def _preprocess_huggingface_models(self): if not onnx_path: logger.error(f'Failed to export {self._args.model_identifier} to ONNX') + self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) return False # Apply INT8 quantization if requested (matching in-house model behavior) @@ -268,6 +318,7 @@ def _preprocess_huggingface_models(self): logger.error(f'Failed to prepare HuggingFace model: {str(e)}') import traceback logger.error(traceback.format_exc()) + self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) return False def _benchmark(self): @@ -275,18 +326,24 @@ def _benchmark(self): import onnxruntime as ort precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'int8': 'int8'} - # Require CUDAExecutionProvider — this benchmark targets GPU inference available = ort.get_available_providers() - if 'CUDAExecutionProvider' not in available: - logger.error(f'CUDAExecutionProvider is not available (available: {available}).') - return False + cuda_available = 'CUDAExecutionProvider' in available + if not cuda_available: + msg = f'CUDAExecutionProvider is not available (available providers: {available}).' + if getattr(self._args, 'require_cuda', False): + logger.error(msg + ' --require_cuda was set, aborting.') + return False + logger.warning( + msg + ' Falling back to registered providers; pass --require_cuda to fail instead.' 
+ ) + providers = ['CUDAExecutionProvider'] if cuda_available else available for model in self._args.pytorch_models: sess_options = ort.SessionOptions() sess_options.graph_optimization_level = self.__graph_opt_level[self._args.graph_opt_level] file_name = '{model}.{precision}.onnx'.format(model=model, precision=self._args.precision) ort_sess = ort.InferenceSession( - f'{self.__model_cache_path / file_name}', sess_options, providers=['CUDAExecutionProvider'] + f'{self.__model_cache_path / file_name}', sess_options, providers=providers ) elapse_times = self.__inference(ort_sess) @@ -318,18 +375,30 @@ def __inference(self, ort_sess): """ precision = np.float16 if self._args.precision == Precision.FLOAT16 else np.float32 - # Get input names from the ONNX session to determine input format - input_names = [input.name for input in ort_sess.get_inputs()] + # Get input metadata from the ONNX session to determine input format and shapes + ort_inputs = ort_sess.get_inputs() + input_names = [inp.name for inp in ort_inputs] # Determine input format based on what the model expects if 'pixel_values' in input_names: - # Vision model: use pixel_values (batch_size, 3, 224, 224) - pixel_values = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision) + # Vision model: derive (C, H, W) from the exported ONNX graph so that models + # with non-default shapes (e.g. 384x384 ViT, 1-channel medical models) work. + # Fall back to (3, 224, 224) only for dynamic / unknown axes. + meta = next(inp for inp in ort_inputs if inp.name == 'pixel_values') + dims = [d if isinstance(d, int) else None for d in (meta.shape or [])] + # Expected layout is (N, C, H, W); pad to length 4 if shorter. 
+ dims = (dims + [None] * 4)[:4] + _, c, h, w = dims + c, h, w = c or 3, h or 224, w or 224 + pixel_values = np.random.randn(self._args.batch_size, c, h, w).astype(dtype=precision) inputs = {'pixel_values': pixel_values} elif 'input_ids' in input_names: - # NLP model: use input_ids and attention_mask + # NLP model: use input_ids and attention_mask. Cap token IDs at the model's + # actual vocab_size to avoid out-of-range embedding lookups (undefined behavior + # on CUDA — silent NaNs / device-side asserts). seq_len = getattr(self._args, 'seq_length', 512) - input_ids = np.random.randint(0, 30000, (self._args.batch_size, seq_len)).astype(np.int64) + vocab_size = getattr(self._hf_config, 'vocab_size', None) or 30000 + input_ids = np.random.randint(0, vocab_size, (self._args.batch_size, seq_len)).astype(np.int64) attention_mask = np.ones((self._args.batch_size, seq_len), dtype=np.int64) inputs = {'input_ids': input_ids, 'attention_mask': attention_mask} else: diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py index 5153073a3..737358aaa 100644 --- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py @@ -5,6 +5,7 @@ import os import re +import subprocess from pathlib import Path import torch @@ -14,7 +15,10 @@ from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig -from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader +from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import ( + HuggingFaceModelLoader, + validate_model_identifier, +) class TensorRTInferenceBenchmark(MicroBenchmarkWithInvoke): @@ -94,6 
+98,47 @@ def add_parser_arguments(self): help='Model identifier for HuggingFace models (e.g., bert-base-uncased).', ) + self._parser.add_argument( + '--allow_remote_code', + action='store_true', + default=False, + required=False, + help='Allow HuggingFace to execute model-repo Python (trust_remote_code=True). ' + 'SECURITY: enables RCE from --model_identifier. Pin --revision when used.', + ) + + @staticmethod + def __detect_workspace_flag(bin_path: str) -> str: + """Return the trtexec workspace flag supported by the installed binary. + + Args: + bin_path: Absolute path to the trtexec binary. + + Returns: + ``'--memPoolSize=workspace:8192M'`` on TensorRT >= 8.4, + ``'--workspace=8192'`` on older runtimes or when probing fails. + """ + modern = '--memPoolSize=workspace:8192M' + legacy = '--workspace=8192' + try: + proc = subprocess.run( + [bin_path, '--help'], capture_output=True, text=True, timeout=10, check=False + ) + help_text = (proc.stdout or '') + (proc.stderr or '') + if '--memPoolSize' in help_text: + return modern + logger.warning( + 'trtexec at %s does not advertise --memPoolSize; falling back to --workspace=8192 ' + '(TensorRT < 8.4 detected).', bin_path + ) + return legacy + except (OSError, subprocess.SubprocessError) as e: + logger.warning( + 'Could not probe trtexec at %s for --memPoolSize support (%s); using --workspace=8192.', + bin_path, e, + ) + return legacy + def _preprocess(self): """Preprocess/preparation operations before the benchmarking. @@ -104,6 +149,11 @@ def _preprocess(self): return False self.__bin_path = str(Path(self._args.bin_dir) / self._bin_name) + # Pick the right workspace flag for the installed trtexec. --memPoolSize was + # introduced in TensorRT 8.4; older runtimes (TRT 8.0-8.3, still found in + # some CUDA 11.x base images) only accept the deprecated-but-still-supported + # --workspace=. Probe once here and reuse for every model. 
+ self.__workspace_flag = self.__detect_workspace_flag(self.__bin_path) # Handle HuggingFace models if specified if self._args.model_source == 'huggingface': @@ -131,7 +181,7 @@ def _preprocess(self): f'--onnx={onnx_model}', # build options f'--optShapes=input:{input_shape}', - '--memPoolSize=workspace:8192M', + self.__workspace_flag, None if self._args.precision == 'fp32' else f'--{self._args.precision}', # inference options f'--iterations={self._args.iterations}', @@ -148,11 +198,34 @@ def _preprocess_huggingface_models(self): Returns: bool: True if preprocessing succeeds. """ - import os from transformers import AutoConfig if not self._args.model_identifier: logger.error('--model_identifier is required when using --model_source huggingface') + self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) + return False + + # Reject malformed / path-like identifiers up front, before any network or disk activity. + try: + validate_model_identifier(self._args.model_identifier) + except ValueError as e: + logger.error(str(e)) + self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) + return False + + allow_remote_code = bool(getattr(self._args, 'allow_remote_code', False)) + + # Reject INT8 on the HuggingFace path: the current pipeline emits `--int8` to + # trtexec without `--calib=` and without a Q/DQ-embedded ONNX, so trtexec + # would fall back to fake dynamic ranges and report misleading latencies. + if str(getattr(self._args, 'precision', '')).lower() == 'int8': + logger.error( + 'TensorRT --precision int8 on HuggingFace models is not supported: ' + 'no calibration data / Q-DQ ONNX is generated, so reported latencies ' + 'would not represent a correctly-calibrated INT8 engine. ' + 'Use --precision fp16 or fp32, or run ORT INT8 quantization first.' 
+ ) + self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) return False try: @@ -163,12 +236,15 @@ def _preprocess_huggingface_models(self): if hf_token: load_kwargs['token'] = hf_token - hf_config = AutoConfig.from_pretrained(self._args.model_identifier, trust_remote_code=True, **load_kwargs) + hf_config = AutoConfig.from_pretrained( + self._args.model_identifier, trust_remote_code=allow_remote_code, **load_kwargs + ) precision_str = self._args.precision # already a string: 'fp16', 'fp32', 'int8' fits, param_m, est_gb, avail_gb = HuggingFaceModelLoader.check_memory_fits( self._args.model_identifier, hf_config, precision_str, mode='inference', token=hf_token ) if not fits: + self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) return False # Step 2: Download and load the full model @@ -193,7 +269,7 @@ def _preprocess_huggingface_models(self): logger.info(f'Loading HuggingFace model: {self._args.model_identifier}') # Load model from HuggingFace on CPU - loader = HuggingFaceModelLoader() + loader = HuggingFaceModelLoader(allow_remote_code=allow_remote_code) hf_model, hf_config, _ = loader.load_model_from_config(model_config, device='cpu') self._hf_config = hf_config exporter = torch2onnxExporter() @@ -205,6 +281,15 @@ def _preprocess_huggingface_models(self): output_dir = str(Path(torch.hub.get_dir()) / 'checkpoints' / f'trt_rank_{proc_rank}') os.makedirs(output_dir, exist_ok=True) + # Defense-in-depth: confirm resolved output path stays inside the rank directory + # even though validate_model_identifier already rejected '..' / '\\' / control chars. 
+ proc_root = Path(output_dir).resolve() + resolved_out = (Path(output_dir) / f'{model_name}.onnx').resolve() + if proc_root not in resolved_out.parents: + logger.error(f'Refusing to write ONNX outside rank dir: {resolved_out} not under {proc_root}') + self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) + return False + onnx_path = exporter.export_huggingface_model( model=hf_model, model_name=model_name, @@ -215,11 +300,15 @@ def _preprocess_huggingface_models(self): if not onnx_path: logger.error(f'Failed to export {self._args.model_identifier} to ONNX') + self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) return False - # Determine input shape based on model type by checking ONNX file + # Determine input shape based on model type by checking ONNX file. + # Pass load_external_data=False because we only need graph input metadata; + # the default True would materialize all sidecar tensors and OOM on the + # >2GB external-data models that this branch was written for. 
import onnx as onnx_lib - onnx_model = onnx_lib.load(onnx_path) + onnx_model = onnx_lib.load(onnx_path, load_external_data=False) # Filter out initializers from graph.input to get only runtime inputs initializer_names = {init.name for init in onnx_model.graph.initializer} @@ -277,7 +366,7 @@ def _preprocess_huggingface_models(self): self.__bin_path, f'--onnx={onnx_path}', f'--optShapes={input_shapes}', - '--memPoolSize=workspace:8192M', + self.__workspace_flag, None if self._args.precision == 'fp32' else f'--{self._args.precision}', f'--iterations={self._args.iterations}', '--percentile=99', @@ -294,6 +383,7 @@ def _preprocess_huggingface_models(self): logger.error(f'Failed to prepare HuggingFace model: {str(e)}') import traceback logger.error(traceback.format_exc()) + self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) return False def _process_raw_result(self, cmd_idx, raw_output): diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py index 55c378500..22061bf7c 100644 --- a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py +++ b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py @@ -20,13 +20,16 @@ from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig -@pytest.mark.skipif(os.environ.get('SB_TEST_HF_E2E', '0') != '1', reason='Skip HF E2E tests. Set SB_TEST_HF_E2E=1 to enable.') +@pytest.mark.skipif( + os.environ.get('SB_TEST_HF_E2E', '0') != '1', + reason='Skip HF E2E tests. 
Set SB_TEST_HF_E2E=1 to enable.', +) class TestHuggingFaceE2E: """End-to-end tests for HuggingFace model loading.""" @pytest.fixture - def loader(self): - """Create a loader instance.""" - return HuggingFaceModelLoader(cache_dir='/tmp/hf_test_cache') + def loader(self, tmp_path): + """Create a loader instance with an isolated per-test cache dir.""" + return HuggingFaceModelLoader(cache_dir=str(tmp_path / 'hf_cache')) def test_load_tiny_bert_model(self, loader): """Test loading a tiny BERT model from HuggingFace Hub. diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py b/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py index e679fb068..0d32c9ea4 100644 --- a/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py +++ b/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py @@ -71,16 +71,18 @@ def test_load_model_success(self, mock_tokenizer, mock_config, mock_model, loade assert config == mock_cfg assert tokenizer == mock_tok - # Verify mocks were called with correct arguments + # Verify mocks were called with correct arguments. trust_remote_code must + # default to False (matches loader.allow_remote_code=False) so that arbitrary + # repo Python is not executed unless the caller explicitly opts in. 
mock_config.from_pretrained.assert_called_once() call_kwargs = mock_config.from_pretrained.call_args assert call_kwargs[0][0] == 'test/model' - assert call_kwargs[1]['trust_remote_code'] is True + assert call_kwargs[1]['trust_remote_code'] is False assert call_kwargs[1]['cache_dir'] == loader.cache_dir mock_model.from_pretrained.assert_called_once() model_call_kwargs = mock_model.from_pretrained.call_args - assert model_call_kwargs[1]['trust_remote_code'] is True + assert model_call_kwargs[1]['trust_remote_code'] is False assert model_call_kwargs[1]['cache_dir'] == loader.cache_dir mock_tokenizer.from_pretrained.assert_called_once() diff --git a/tests/helper/decorator.py b/tests/helper/decorator.py index 8d0ad314b..ff08469ac 100644 --- a/tests/helper/decorator.py +++ b/tests/helper/decorator.py @@ -13,7 +13,6 @@ pytorch_test = unittest.skipIf(os.environ.get('SB_TEST_PYTORCH', '1') == '0', 'Skip PyTorch tests.') directx_test = unittest.skipIf(os.environ.get('SB_TEST_DIRECTX', '0') == '0', 'Skip DirectX tests.') -hf_e2e_test = unittest.skipUnless(os.environ.get('SB_TEST_HF_E2E', '0') == '1', 'Skip HF E2E tests. 
Set SB_TEST_HF_E2E=1 to enable.') def load_data(filepath): From 18f13ef4b9b8f05644b5ee79ebe8b80b325766bf Mon Sep 17 00:00:00 2001 From: Aishwarya Tonpe Date: Tue, 2 Jun 2026 10:29:13 -0700 Subject: [PATCH 03/12] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .../micro_benchmarks/tensorrt_inference_performance.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py index b00791aec..717d7aa0d 100644 --- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py @@ -358,6 +358,8 @@ def _derive_trt_input_shapes(self, onnx_path): # Filter out initializers from graph.input to get only runtime inputs initializer_names = {init.name for init in onnx_model.graph.initializer} runtime_inputs = [inp for inp in onnx_model.graph.input if inp.name not in initializer_names] + if not runtime_inputs: + raise ValueError(f'No runtime inputs found in exported ONNX model: {onnx_path}') # Get the first runtime input to determine shape and name input_name = runtime_inputs[0].name From 83a533ff6a73b6fbfcc4573a4e2e7512ae1acc80 Mon Sep 17 00:00:00 2001 From: Aishwarya Tonpe Date: Tue, 2 Jun 2026 10:29:51 -0700 Subject: [PATCH 04/12] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .../micro_benchmarks/tensorrt_inference_performance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py index 717d7aa0d..06d1dd6e0 100644 --- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py +++ 
b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py @@ -239,7 +239,7 @@ def _preprocess_huggingface_models(self): self._args.model_identifier, trust_remote_code=allow_remote_code, **load_kwargs ) precision_str = self._args.precision # already a string: 'fp16', 'fp32', 'int8' - fits, param_m, est_gb, avail_gb = HuggingFaceModelLoader.check_memory_fits( + fits, _, _, _ = HuggingFaceModelLoader.check_memory_fits( self._args.model_identifier, hf_config, precision_str, mode='inference', token=hf_token ) if not fits: From 44da2e13402c521eaacb88a550a23a3967c666ec Mon Sep 17 00:00:00 2001 From: Aishwarya Tonpe Date: Tue, 2 Jun 2026 10:30:31 -0700 Subject: [PATCH 05/12] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .../benchmarks/micro_benchmarks/ort_inference_performance.py | 1 + 1 file changed, 1 insertion(+) diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py index 8cbf269df..644d964f8 100644 --- a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py @@ -345,6 +345,7 @@ def _benchmark(self): msg = f'CUDAExecutionProvider is not available (available providers: {available}).' 
if getattr(self._args, 'require_cuda', False): logger.error(msg + ' --require_cuda was set, aborting.') + self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) return False logger.warning(msg + ' Falling back to registered providers; pass --require_cuda to fail instead.') providers = ['CUDAExecutionProvider'] if cuda_available else available From 864a8e9cdf8e18bccdff0cd7143f4f09a4edb6ce Mon Sep 17 00:00:00 2001 From: Aishwarya Tonpe Date: Tue, 2 Jun 2026 10:30:59 -0700 Subject: [PATCH 06/12] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- tests/benchmarks/micro_benchmarks/test_model_source_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmarks/micro_benchmarks/test_model_source_config.py b/tests/benchmarks/micro_benchmarks/test_model_source_config.py index 9d9f7f35e..81cb7d0fa 100644 --- a/tests/benchmarks/micro_benchmarks/test_model_source_config.py +++ b/tests/benchmarks/micro_benchmarks/test_model_source_config.py @@ -36,7 +36,7 @@ def test_invalid_dtype(self): def test_missing_identifier(self): """Test missing identifier raises error.""" - with pytest.raises(ValueError, match='identifier must be provided'): + with pytest.raises(ValueError, match='Model identifier must be provided'): ModelSourceConfig(identifier='') def test_validate_huggingface_empty(self): From 54e4153a6d44e011fd76cdff02b11f5e7e47dac8 Mon Sep 17 00:00:00 2001 From: Aishwarya Tonpe Date: Tue, 2 Jun 2026 10:31:14 -0700 Subject: [PATCH 07/12] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .../micro_benchmarks/_export_torch_to_onnx.py | 191 ++++++++++-------- .../huggingface_model_loader.py | 89 ++++---- .../ort_inference_performance.py | 2 +- .../micro_benchmarks/test_huggingface_e2e.py | 7 +- 4 files changed, 159 insertions(+), 130 deletions(-) diff --git 
a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py index ab94f74e7..d395d18e0 100644 --- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py +++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py @@ -18,6 +18,7 @@ class torch2onnxExporter(): """PyTorch model to ONNX exporter.""" + def __init__(self): """Constructor.""" from transformers import BertConfig, GPT2Config, LlamaConfig @@ -314,95 +315,15 @@ def export_huggingface_model(self, model, model_name, batch_size=1, seq_length=5 is_vision_model = main_input == 'pixel_values' if is_vision_model: - # Vision models: use pixel_values (batch_size, channels, height, width) - # Derive C/H/W from model config rather than hard-coding 3x224x224 - num_channels = getattr(model.config, 'num_channels', 3) - image_size = getattr(model.config, 'image_size', 224) - if isinstance(image_size, (list, tuple)): - img_h, img_w = image_size[0], image_size[1] - else: - img_h, img_w = image_size, image_size - - dummy_input = torch.randn(batch_size, num_channels, img_h, img_w, dtype=model_dtype, device=device) - input_names = ['pixel_values'] - dynamic_axes = {'pixel_values': {0: 'batch_size'}, 'output': {0: 'batch_size'}} - - # Wrapper for vision models - class VisionModelWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, pixel_values): - outputs = self.model(pixel_values=pixel_values) - if hasattr(outputs, 'logits'): - return outputs.logits - elif hasattr(outputs, 'last_hidden_state'): - return outputs.last_hidden_state - else: - return outputs[0] if isinstance(outputs, (tuple, list)) else outputs - - wrapped_model = VisionModelWrapper(model) - export_args = (dummy_input, ) + wrapped_model, export_args, input_names, dynamic_axes = self._build_vision_export_inputs( + model, batch_size, model_dtype, device + ) else: - # NLP models: use input_ids and attention_mask - 
dummy_input = torch.ones((batch_size, seq_length), dtype=torch.int64, device=device) - attention_mask = torch.ones((batch_size, seq_length), dtype=torch.int64, device=device) - input_names = ['input_ids', 'attention_mask'] - dynamic_axes = { - 'input_ids': { - 0: 'batch_size', - 1: 'seq_length' - }, - 'attention_mask': { - 0: 'batch_size', - 1: 'seq_length' - }, - 'output': { - 0: 'batch_size', - 1: 'seq_length' - }, - } + wrapped_model, export_args, input_names, dynamic_axes = self._build_nlp_export_inputs( + model, batch_size, seq_length, device + ) - # Wrapper for NLP models - class NLPModelWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, input_ids, attention_mask): - outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) - if hasattr(outputs, 'logits'): - return outputs.logits - elif hasattr(outputs, 'last_hidden_state'): - return outputs.last_hidden_state - else: - return outputs[0] if isinstance(outputs, (tuple, list)) else outputs - - wrapped_model = NLPModelWrapper(model) - export_args = (dummy_input, attention_mask) - - # Export to ONNX for large models (>2GB), use external data format - model_size_gb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024**3) - use_external_data = model_size_gb > 2.0 - - if use_external_data: - logger.info(f'Model size is {model_size_gb:.2f}GB, using external data format for ONNX export') - - export_kwargs = { - 'opset_version': 14, - 'do_constant_folding': True, - 'input_names': input_names, - 'output_names': ['output'], - 'dynamic_axes': dynamic_axes, - } - if use_external_data: - # PyTorch 2.8+ renamed 'use_external_data_format' to 'external_data' - sig = inspect.signature(torch.onnx.export) - if 'external_data' in sig.parameters: - export_kwargs['external_data'] = True - else: - export_kwargs['use_external_data_format'] = True + export_kwargs = self._build_onnx_export_kwargs(model, input_names, dynamic_axes) 
torch.onnx.export( wrapped_model, @@ -412,7 +333,7 @@ def forward(self, input_ids, attention_mask): ) # Clean up - del dummy_input + del export_args if torch.cuda.is_available(): torch.cuda.empty_cache() @@ -422,3 +343,97 @@ def forward(self, input_ids, attention_mask): logger.error(f'Failed to export HuggingFace model to ONNX: {str(e)}') logger.error(traceback.format_exc()) return '' + + def _build_vision_export_inputs(self, model, batch_size, model_dtype, device): + """Build the dummy inputs and wrapper module for exporting a vision HuggingFace model.""" + # Vision models: use pixel_values (batch_size, channels, height, width) + # Derive C/H/W from model config rather than hard-coding 3x224x224 + num_channels = getattr(model.config, 'num_channels', 3) + image_size = getattr(model.config, 'image_size', 224) + if isinstance(image_size, (list, tuple)): + img_h, img_w = image_size[0], image_size[1] + else: + img_h, img_w = image_size, image_size + + dummy_input = torch.randn(batch_size, num_channels, img_h, img_w, dtype=model_dtype, device=device) + input_names = ['pixel_values'] + dynamic_axes = {'pixel_values': {0: 'batch_size'}, 'output': {0: 'batch_size'}} + + class VisionModelWrapper(torch.nn.Module): + + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, pixel_values): + outputs = self.model(pixel_values=pixel_values) + if hasattr(outputs, 'logits'): + return outputs.logits + elif hasattr(outputs, 'last_hidden_state'): + return outputs.last_hidden_state + else: + return outputs[0] if isinstance(outputs, (tuple, list)) else outputs + + return VisionModelWrapper(model), (dummy_input, ), input_names, dynamic_axes + + def _build_nlp_export_inputs(self, model, batch_size, seq_length, device): + """Build the dummy inputs and wrapper module for exporting an NLP HuggingFace model.""" + # NLP models: use input_ids and attention_mask + dummy_input = torch.ones((batch_size, seq_length), dtype=torch.int64, device=device) + 
attention_mask = torch.ones((batch_size, seq_length), dtype=torch.int64, device=device) + input_names = ['input_ids', 'attention_mask'] + dynamic_axes = { + 'input_ids': { + 0: 'batch_size', + 1: 'seq_length' + }, + 'attention_mask': { + 0: 'batch_size', + 1: 'seq_length' + }, + 'output': { + 0: 'batch_size', + 1: 'seq_length' + }, + } + + class NLPModelWrapper(torch.nn.Module): + + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, input_ids, attention_mask): + outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) + if hasattr(outputs, 'logits'): + return outputs.logits + elif hasattr(outputs, 'last_hidden_state'): + return outputs.last_hidden_state + else: + return outputs[0] if isinstance(outputs, (tuple, list)) else outputs + + return NLPModelWrapper(model), (dummy_input, attention_mask), input_names, dynamic_axes + + def _build_onnx_export_kwargs(self, model, input_names, dynamic_axes): + """Assemble torch.onnx.export kwargs, enabling external-data format for >2GB models.""" + model_size_gb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024**3) + use_external_data = model_size_gb > 2.0 + + if use_external_data: + logger.info(f'Model size is {model_size_gb:.2f}GB, using external data format for ONNX export') + + export_kwargs = { + 'opset_version': 14, + 'do_constant_folding': True, + 'input_names': input_names, + 'output_names': ['output'], + 'dynamic_axes': dynamic_axes, + } + if use_external_data: + # PyTorch 2.8+ renamed 'use_external_data_format' to 'external_data' + sig = inspect.signature(torch.onnx.export) + if 'external_data' in sig.parameters: + export_kwargs['external_data'] = True + else: + export_kwargs['use_external_data_format'] = True + return export_kwargs diff --git a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py index c7f28a2eb..c72f598cd 100644 --- 
a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py +++ b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py @@ -85,6 +85,7 @@ class HuggingFaceModelLoader: ``False``; enabling this turns ``--model_identifier`` into an RCE sink, so it is opt-in only. """ + def __init__( self, cache_dir: Optional[str] = None, @@ -150,19 +151,7 @@ def load_model( validate_model_identifier(model_identifier) try: - # Convert torch_dtype string to torch dtype - dtype = self._get_torch_dtype(torch_dtype) if torch_dtype else None - - # Prepare loading kwargs - load_kwargs = {'cache_dir': self.cache_dir, 'revision': revision, **kwargs} - - # Add token if available - if self.token: - load_kwargs['token'] = self.token - - # Add dtype if specified - if dtype: - load_kwargs['torch_dtype'] = dtype + load_kwargs = self._build_load_kwargs(torch_dtype, revision, kwargs) # Load config (use pre-downloaded config if provided) if config is None: @@ -173,35 +162,11 @@ def load_model( else: logger.info('Using pre-downloaded model configuration.') - # Load tokenizer (may fail for some models, that's ok) - tokenizer = None - try: - logger.info('Loading tokenizer...') - tokenizer = AutoTokenizer.from_pretrained( - model_identifier, trust_remote_code=self.allow_remote_code, **load_kwargs - ) - except Exception as e: - logger.warning(f'Could not load tokenizer: {e}. 
Continuing without tokenizer.') + tokenizer = self._try_load_tokenizer(model_identifier, load_kwargs) # Load model logger.info(f'Loading model weights (dtype={torch_dtype}, device={device})...') - model_kwargs = load_kwargs.copy() - model_kwargs['trust_remote_code'] = self.allow_remote_code - - # Handle device mapping for large models - effective_device_map = device_map - if device_map: - model_kwargs['device_map'] = device_map - elif device == 'cuda' and torch.cuda.is_available(): - # Don't set device_map if device is explicitly cuda - pass - elif device != 'cpu': - model_kwargs['device_map'] = device - effective_device_map = device - - # Pass pre-downloaded config to from_pretrained so any overrides take effect - if config is not None: - model_kwargs['config'] = config + model_kwargs, effective_device_map = self._build_model_kwargs(load_kwargs, device, device_map, config) try: model = AutoModel.from_pretrained(model_identifier, **model_kwargs) @@ -230,6 +195,52 @@ def load_model( except Exception as e: raise ModelLoadError(f"Unexpected error loading model '{model_identifier}': {e}") from e + def _build_load_kwargs(self, torch_dtype, revision, extra_kwargs): + """Assemble the base ``from_pretrained`` kwargs (cache_dir, token, dtype, revision).""" + dtype = self._get_torch_dtype(torch_dtype) if torch_dtype else None + load_kwargs = {'cache_dir': self.cache_dir, 'revision': revision, **extra_kwargs} + if self.token: + load_kwargs['token'] = self.token + if dtype: + load_kwargs['torch_dtype'] = dtype + return load_kwargs + + def _try_load_tokenizer(self, model_identifier, load_kwargs): + """Attempt to load a tokenizer; return None if the model has no associated tokenizer.""" + try: + logger.info('Loading tokenizer...') + return AutoTokenizer.from_pretrained( + model_identifier, trust_remote_code=self.allow_remote_code, **load_kwargs + ) + except Exception as e: + logger.warning(f'Could not load tokenizer: {e}. 
Continuing without tokenizer.') + return None + + def _build_model_kwargs(self, load_kwargs, device, device_map, config): + """Build model-loading kwargs and resolve the effective device_map. + + Returns: + Tuple[dict, Optional[str]]: ``(model_kwargs, effective_device_map)``. + """ + model_kwargs = load_kwargs.copy() + model_kwargs['trust_remote_code'] = self.allow_remote_code + + effective_device_map = device_map + if device_map: + model_kwargs['device_map'] = device_map + elif device == 'cuda' and torch.cuda.is_available(): + # Don't set device_map if device is explicitly cuda + pass + elif device != 'cpu': + model_kwargs['device_map'] = device + effective_device_map = device + + # Pass pre-downloaded config to from_pretrained so any overrides take effect + if config is not None: + model_kwargs['config'] = config + + return model_kwargs, effective_device_map + def load_model_from_config( self, config: ModelSourceConfig, diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py index 644d964f8..7c97b2bc3 100644 --- a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py @@ -232,7 +232,7 @@ def _preprocess_huggingface_models(self): self._hf_config = hf_config precision_str = self._args.precision.value if self._args.precision != Precision.INT8 else 'float32' - fits, param_m, est_gb, avail_gb = HuggingFaceModelLoader.check_memory_fits( + fits, _, _, _ = HuggingFaceModelLoader.check_memory_fits( self._args.model_identifier, hf_config, precision_str, mode='inference', token=hf_token ) if not fits: diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py index 22061bf7c..dc7cf7d62 100644 --- a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py +++ b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py @@ 
-16,8 +16,10 @@ pytest.importorskip('transformers') -from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader -from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig +# Imports below this point depend on `transformers` being available, so they +# must be deferred until after the `importorskip` call above. +from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader # noqa: E402 +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig # noqa: E402 @pytest.mark.skipif( @@ -26,6 +28,7 @@ ) class TestHuggingFaceE2E: """End-to-end tests for HuggingFace model loading.""" + @pytest.fixture def loader(self, tmp_path): """Create a loader instance with an isolated per-test cache dir.""" From a5d845c89e0fc26b89b2590852af5399ff011240 Mon Sep 17 00:00:00 2001 From: Aishwarya Tonpe Date: Tue, 2 Jun 2026 10:55:55 -0700 Subject: [PATCH 08/12] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .../benchmarks/micro_benchmarks/huggingface_model_loader.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py index c72f598cd..6e4a5511b 100644 --- a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py +++ b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py @@ -52,6 +52,10 @@ def validate_model_identifier(model_identifier: Optional[str]) -> str: "'^[A-Za-z0-9][A-Za-z0-9._-]{0,127}(/[A-Za-z0-9._-]{1,128})?$' " '(e.g. "bert-base-uncased" or "meta-llama/Llama-2-7b-hf").' ) + if Path(model_identifier).exists(): + raise ValueError( + f'Invalid model_identifier {model_identifier!r}. Refusing to treat an existing local path as a Hub repo id.' 
+ ) return model_identifier From 09f14b27fdf32aa68725160aff6582a0fb7aba6d Mon Sep 17 00:00:00 2001 From: Aishwarya Tonpe Date: Tue, 2 Jun 2026 10:55:55 -0700 Subject: [PATCH 09/12] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .../tensorrt_inference_performance.py | 6 ++++++ .../micro_benchmarks/_export_torch_to_onnx.py | 3 --- .../huggingface_model_loader.py | 20 ++++++++++++++++--- .../ort_inference_performance.py | 2 +- .../tensorrt_inference_performance.py | 7 ++++--- .../micro_benchmarks/test_huggingface_e2e.py | 1 - 6 files changed, 28 insertions(+), 11 deletions(-) diff --git a/examples/benchmarks/tensorrt_inference_performance.py b/examples/benchmarks/tensorrt_inference_performance.py index 4385a728e..1880ab242 100644 --- a/examples/benchmarks/tensorrt_inference_performance.py +++ b/examples/benchmarks/tensorrt_inference_performance.py @@ -91,6 +91,12 @@ def run_huggingface_benchmark(model_identifier, precision='fp16', batch_size=32, parser.add_argument('--iterations', type=int, default=2048) args = parser.parse_args() + if args.model_source == 'huggingface' and args.precision == 'int8': + parser.error( + '--precision int8 is not supported with --model_source huggingface ' + '(no calibration data / Q-DQ ONNX is generated). Use fp16 or fp32.' 
+ ) + if args.model_source == 'huggingface': run_huggingface_benchmark( args.model_identifier, args.precision, args.batch_size, args.seq_length, args.iterations diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py index d395d18e0..28715db64 100644 --- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py +++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py @@ -18,7 +18,6 @@ class torch2onnxExporter(): """PyTorch model to ONNX exporter.""" - def __init__(self): """Constructor.""" from transformers import BertConfig, GPT2Config, LlamaConfig @@ -360,7 +359,6 @@ def _build_vision_export_inputs(self, model, batch_size, model_dtype, device): dynamic_axes = {'pixel_values': {0: 'batch_size'}, 'output': {0: 'batch_size'}} class VisionModelWrapper(torch.nn.Module): - def __init__(self, model): super().__init__() self.model = model @@ -398,7 +396,6 @@ def _build_nlp_export_inputs(self, model, batch_size, seq_length, device): } class NLPModelWrapper(torch.nn.Module): - def __init__(self, model): super().__init__() self.model = model diff --git a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py index 6e4a5511b..0937e40aa 100644 --- a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py +++ b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py @@ -89,7 +89,6 @@ class HuggingFaceModelLoader: ``False``; enabling this turns ``--model_identifier`` into an RCE sink, so it is opt-in only. """ - def __init__( self, cache_dir: Optional[str] = None, @@ -154,6 +153,11 @@ def load_model( # Reject malformed / path-like identifiers before any network or disk activity. validate_model_identifier(model_identifier) + # Fall back to CPU on hosts without CUDA so default device='cuda' callers don't fail. 
+ if device == 'cuda' and not torch.cuda.is_available(): + logger.warning('CUDA not available; falling back to CPU.') + device = 'cpu' + try: load_kwargs = self._build_load_kwargs(torch_dtype, revision, kwargs) @@ -211,10 +215,12 @@ def _build_load_kwargs(self, torch_dtype, revision, extra_kwargs): def _try_load_tokenizer(self, model_identifier, load_kwargs): """Attempt to load a tokenizer; return None if the model has no associated tokenizer.""" + # Tokenizers don't accept model-only kwargs like torch_dtype/device_map; strip before passing. + tokenizer_kwargs = {k: v for k, v in load_kwargs.items() if k not in ('torch_dtype', 'device_map')} try: logger.info('Loading tokenizer...') return AutoTokenizer.from_pretrained( - model_identifier, trust_remote_code=self.allow_remote_code, **load_kwargs + model_identifier, trust_remote_code=self.allow_remote_code, **tokenizer_kwargs ) except Exception as e: logger.warning(f'Could not load tokenizer: {e}. Continuing without tokenizer.') @@ -373,7 +379,15 @@ def estimate_param_count_from_config(hf_config) -> Optional[int]: # Embeddings: token + (optional) position max_pos = getattr(hf_config, 'max_position_embeddings', 0) - has_pos_embed = getattr(hf_config, 'position_embedding_type', None) not in ('rotary', None) + pos_embed_type = getattr(hf_config, 'position_embedding_type', None) + # When position_embedding_type is missing/None, default to assuming learned + # position embeddings exist (common for BERT-style configs that omit the field). + # Only skip the term when the type is explicitly rotary, or the config clearly + # indicates RoPE/rotary via rope_theta/rotary_pct/rotary_emb_base. 
+ uses_rotary = pos_embed_type == 'rotary' or any( + getattr(hf_config, attr, None) is not None for attr in ('rope_theta', 'rotary_pct', 'rotary_emb_base') + ) + has_pos_embed = not uses_rotary embed_params = vocab * hidden if has_pos_embed and max_pos > 0: embed_params += max_pos * hidden diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py index 7c97b2bc3..ee76def7d 100644 --- a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py @@ -145,7 +145,7 @@ def add_parser_arguments(self): default=False, required=False, help='Allow HuggingFace to execute model-repo Python (trust_remote_code=True). ' - 'SECURITY: enables RCE from --model_identifier. Pin --revision when used.', + 'SECURITY: enables RCE from --model_identifier; only enable for trusted model identifiers.', ) def _preprocess(self): diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py index 06d1dd6e0..d467c7987 100644 --- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py @@ -104,7 +104,7 @@ def add_parser_arguments(self): default=False, required=False, help='Allow HuggingFace to execute model-repo Python (trust_remote_code=True). ' - 'SECURITY: enables RCE from --model_identifier. 
Pin --revision when used.', + 'SECURITY: enables RCE from --model_identifier; only enable for trusted model identifiers.', ) @staticmethod @@ -238,9 +238,10 @@ def _preprocess_huggingface_models(self): hf_config = AutoConfig.from_pretrained( self._args.model_identifier, trust_remote_code=allow_remote_code, **load_kwargs ) - precision_str = self._args.precision # already a string: 'fp16', 'fp32', 'int8' + # ONNX export is always done in float32 (see _build_trtexec_command_for_hf), so gate + # the pre-download check on fp32 memory regardless of the requested runtime precision. fits, _, _, _ = HuggingFaceModelLoader.check_memory_fits( - self._args.model_identifier, hf_config, precision_str, mode='inference', token=hf_token + self._args.model_identifier, hf_config, 'fp32', mode='inference', token=hf_token ) if not fits: self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py index dc7cf7d62..3f2283bb5 100644 --- a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py +++ b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py @@ -28,7 +28,6 @@ ) class TestHuggingFaceE2E: """End-to-end tests for HuggingFace model loading.""" - @pytest.fixture def loader(self, tmp_path): """Create a loader instance with an isolated per-test cache dir.""" From 135f38159331f07d2f07ed8734b3c5d9a74fc1c7 Mon Sep 17 00:00:00 2001 From: Aishwarya Tonpe Date: Tue, 2 Jun 2026 10:55:55 -0700 Subject: [PATCH 10/12] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .../huggingface_model_loader.py | 2 +- .../ort_inference_performance.py | 17 ++++++++++++---- .../tensorrt_inference_performance.py | 7 +++++++ .../micro_benchmarks/test_huggingface_e2e.py | 20 +++++++++++-------- 4 files changed, 33 insertions(+), 13 deletions(-) diff --git 
a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py index 0937e40aa..ba8454df0 100644 --- a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py +++ b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py @@ -350,7 +350,7 @@ def _get_model_size(self, model: PreTrainedModel) -> float: Returns: Number of parameters in millions. """ - return float(sum(p.numel() for p in model.parameters())) / 1_000_000 + return float(sum(p.numel() for p in model.parameters())) / 1_000_000 # type: ignore[attr-defined] @staticmethod def estimate_param_count_from_config(hf_config) -> Optional[int]: diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py index ee76def7d..309c37b06 100644 --- a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py @@ -23,6 +23,7 @@ class ORTInferenceBenchmark(MicroBenchmark): """ONNXRuntime inference micro-benchmark class.""" + def __init__(self, name, parameters=''): """Constructor. @@ -240,7 +241,8 @@ def _preprocess_huggingface_models(self): return False # Step 2: Export the model to ONNX (and quantize for INT8) on a per-rank path. - return self._export_hf_model_to_onnx(hf_token, allow_remote_code) + # Reuse the already-downloaded hf_config to avoid a redundant fetch in load_model_from_config. 
+ return self._export_hf_model_to_onnx(hf_token, allow_remote_code, hf_config) except Exception as e: logger.error(f'Failed to prepare HuggingFace model: {str(e)}') @@ -249,12 +251,13 @@ def _preprocess_huggingface_models(self): self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) return False - def _export_hf_model_to_onnx(self, hf_token, allow_remote_code): + def _export_hf_model_to_onnx(self, hf_token, allow_remote_code, hf_config=None): """Download the HF model, export to ONNX, and apply INT8 quantization if requested. Args: hf_token (str | None): HuggingFace token, or None. allow_remote_code (bool): Whether to allow trust_remote_code on load. + hf_config: Pre-downloaded HF config to reuse; avoids a redundant fetch. Returns: bool: True on success; False (with return code set) on failure. @@ -277,9 +280,9 @@ def _export_hf_model_to_onnx(self, hf_token, allow_remote_code): device_map=None, ) - # Load model from HuggingFace on CPU + # Load model from HuggingFace on CPU, reusing the preloaded config when available. loader = HuggingFaceModelLoader(allow_remote_code=allow_remote_code) - hf_model, _, _ = loader.load_model_from_config(model_config, device='cpu') + hf_model, _, _ = loader.load_model_from_config(model_config, device='cpu', config_pretrained=hf_config) from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter exporter = torch2onnxExporter() @@ -320,6 +323,12 @@ def _export_hf_model_to_onnx(self, hf_token, allow_remote_code): self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) return False + # Release the torch model now that ONNX export is done; export_huggingface_model() may + # have moved it onto GPU, and we don't want it occupying VRAM during ORT session creation. 
+ del hf_model + if torch.cuda.is_available(): + torch.cuda.empty_cache() + # Apply INT8 quantization if requested (matching in-house model behavior) if self._args.precision == Precision.INT8: from onnxruntime.quantization import quantize_dynamic diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py index d467c7987..386e84dc0 100644 --- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py @@ -23,6 +23,7 @@ class TensorRTInferenceBenchmark(MicroBenchmarkWithInvoke): """TensorRT inference micro-benchmark class.""" + def __init__(self, name, parameters=''): """Constructor. @@ -321,6 +322,12 @@ def _build_trtexec_command_for_hf(self, hf_token, allow_remote_code): self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) return False + # Release the torch model now that ONNX export is done; export_huggingface_model() may + # have moved it onto GPU, and we don't want it holding VRAM while trtexec builds the engine. + del hf_model + if torch.cuda.is_available(): + torch.cuda.empty_cache() + input_shapes = self._derive_trt_input_shapes(onnx_path) # Build TensorRT command with correct input name diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py index 3f2283bb5..d9922d72e 100644 --- a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py +++ b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py @@ -18,8 +18,12 @@ # Imports below this point depend on `transformers` being available, so they # must be deferred until after the `importorskip` call above. 
-from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader # noqa: E402 -from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig # noqa: E402 +from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import ( # noqa: E402 + HuggingFaceModelLoader, +) +from superbench.benchmarks.micro_benchmarks.model_source_config import ( # noqa: E402 + ModelSourceConfig, +) @pytest.mark.skipif( @@ -38,7 +42,7 @@ def test_load_tiny_bert_model(self, loader): Uses prajjwal1/bert-tiny which is a small public BERT model (~17MB). """ - model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + model, config, _ = loader.load_model('prajjwal1/bert-tiny', device='cpu') assert model is not None assert config is not None @@ -55,7 +59,7 @@ def test_load_distilgpt2_model(self, loader): Uses distilbert/distilgpt2 which is a small public GPT-2 model (~82MB). """ - model, config, tokenizer = loader.load_model('distilbert/distilgpt2', device='cpu') + model, config, _ = loader.load_model('distilbert/distilgpt2', device='cpu') assert model is not None assert config is not None @@ -71,14 +75,14 @@ def test_load_model_from_config(self, loader): """Test loading model using ModelSourceConfig via load_model_from_config.""" config = ModelSourceConfig(source='huggingface', identifier='prajjwal1/bert-tiny', torch_dtype='float32') - model, hf_config, tokenizer = loader.load_model_from_config(config, device='cpu') + model, hf_config, _ = loader.load_model_from_config(config, device='cpu') assert model is not None assert hf_config.model_type == 'bert' def test_load_model_with_dtype(self, loader): """Test loading model and converting dtype after load.""" - model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + model, _, _ = loader.load_model('prajjwal1/bert-tiny', device='cpu') # Convert to float32 after loading model = model.float() @@ -90,7 +94,7 @@ def 
test_load_model_with_dtype(self, loader): @pytest.mark.skipif(not torch.cuda.is_available(), reason='Requires GPU') def test_load_model_to_gpu(self, loader): """Test loading model and moving to GPU.""" - model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + model, _, _ = loader.load_model('prajjwal1/bert-tiny', device='cpu') # Move to GPU manually model = model.cuda() @@ -101,7 +105,7 @@ def test_load_model_to_gpu(self, loader): def test_architecture_detection(self, loader): """Test that architecture is correctly detected from loaded model.""" - model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + _, config, _ = loader.load_model('prajjwal1/bert-tiny', device='cpu') # Architecture should be detected from config assert config.model_type is not None From c3744390ce6138c40aa4f8e862232ecab6120c41 Mon Sep 17 00:00:00 2001 From: Aishwarya Tonpe Date: Tue, 2 Jun 2026 10:55:55 -0700 Subject: [PATCH 11/12] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .../test_tensorrt_inference_performance.py | 451 ++++++++++++++++++ 1 file changed, 451 insertions(+) diff --git a/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py b/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py index 301a4a08d..441be7af1 100644 --- a/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py +++ b/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py @@ -6,6 +6,7 @@ import unittest from pathlib import Path from types import SimpleNamespace +from unittest.mock import MagicMock, patch from tests.helper import decorator from tests.helper.testcase import BenchmarkTestCase @@ -13,6 +14,31 @@ from superbench.benchmarks.result import BenchmarkResult +def _make_onnx_dim(value): + """Build an ONNX-graph-input-style dim mock that exposes ``dim_value``.""" + return 
SimpleNamespace(dim_value=value) + + +def _make_onnx_input(name, dims): + """Build an ONNX-graph-input mock with the given name and dim values. + + A ``dim_value`` of ``0`` mimics a dynamic/unknown dimension, matching how + ``onnx`` represents symbolic dims (``dim_param`` set, ``dim_value`` == 0). + """ + return SimpleNamespace( + name=name, + type=SimpleNamespace( + tensor_type=SimpleNamespace(shape=SimpleNamespace(dim=[_make_onnx_dim(d) for d in dims])) + ), + ) + + +def _make_onnx_model(inputs, initializer_names=()): + """Build an ONNX model mock with the given graph inputs and initializers.""" + initializers = [SimpleNamespace(name=n) for n in initializer_names] + return SimpleNamespace(graph=SimpleNamespace(input=list(inputs), initializer=initializers)) + + class TensorRTInferenceBenchmarkTestCase(BenchmarkTestCase, unittest.TestCase): """Class for tensorrt-inferencee benchmark test cases.""" @classmethod @@ -144,3 +170,428 @@ def test_tensorrt_inference_result_parsing(self, test_raw_log_1, test_raw_log_2) # Negative case - invalid raw output self.assertFalse(benchmark._process_raw_result(1, 'Invalid raw output')) + + +_TENSORRT_MODULE = 'superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance' + + +class TensorRTInferenceHuggingFaceTestCase(unittest.TestCase): + """Unit tests for the HuggingFace-specific helpers on TensorRTInferenceBenchmark. + + These tests exercise the methods that previously had zero coverage: + ``_preprocess_huggingface_models``, ``_build_trtexec_command_for_hf``, + ``_derive_trt_input_shapes``, ``_derive_vision_input_shape``, and + ``_derive_nlp_input_shapes``. They are pure unit tests (no CUDA / no HF + network) and rely on mocking the model loader, ONNX exporter, and the + ``onnx`` loader to keep them fast and deterministic. + """ + + benchmark_name = 'tensorrt-inference' + + def _make_benchmark(self, **arg_overrides): + """Build a benchmark instance with mock args and bin/workspace state. 
+ + Mimics the post-``_preprocess`` state of the object (bin path and + workspace flag already resolved) without actually invoking trtexec or + touching the filesystem. + """ + (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark( + self.benchmark_name, Platform.CUDA + ) + benchmark = benchmark_cls(self.benchmark_name, parameters='') + benchmark._result = BenchmarkResult( + self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1 + ) + defaults = dict( + model_source='huggingface', + model_identifier='prajjwal1/bert-tiny', + allow_remote_code=False, + precision='fp16', + batch_size=8, + seq_length=128, + iterations=128, + pytorch_models=[], + log_raw_data=False, + ) + defaults.update(arg_overrides) + benchmark._args = SimpleNamespace(**defaults) + # Set name-mangled private attributes that _preprocess() normally fills in. + benchmark._TensorRTInferenceBenchmark__bin_path = '/fake/bin/trtexec' + benchmark._TensorRTInferenceBenchmark__workspace_flag = '--memPoolSize=workspace:8192M' + benchmark._commands = [] + return benchmark + + # ------------------------------------------------------------------ + # _preprocess_huggingface_models + # ------------------------------------------------------------------ + + def test_preprocess_hf_missing_model_identifier(self): + """Missing --model_identifier is rejected before any HF I/O.""" + benchmark = self._make_benchmark(model_identifier=None) + + self.assertFalse(benchmark._preprocess_huggingface_models()) + self.assertEqual(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE, benchmark.return_code) + self.assertEqual([], benchmark._commands) + + def test_preprocess_hf_invalid_identifier(self): + """Path-like / unsafe identifier is rejected by validate_model_identifier.""" + benchmark = self._make_benchmark(model_identifier='../etc/passwd') + + self.assertFalse(benchmark._preprocess_huggingface_models()) + self.assertEqual(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE, benchmark.return_code) + + 
def test_preprocess_hf_int8_rejected(self): + """INT8 on the HF path is rejected (no calibration data emitted).""" + benchmark = self._make_benchmark(precision='int8') + + self.assertFalse(benchmark._preprocess_huggingface_models()) + self.assertEqual(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE, benchmark.return_code) + + def test_preprocess_hf_memory_check_fails(self): + """When check_memory_fits reports fits=False, preprocess fails.""" + benchmark = self._make_benchmark() + + fake_config = MagicMock(name='AutoConfigInstance') + with patch('transformers.AutoConfig') as mock_auto_config, \ + patch(f'{_TENSORRT_MODULE}.HuggingFaceModelLoader') as mock_loader_cls: + mock_auto_config.from_pretrained.return_value = fake_config + mock_loader_cls.check_memory_fits.return_value = (False, 1000.0, 30.0, 16.0) + + self.assertFalse(benchmark._preprocess_huggingface_models()) + + self.assertEqual(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE, benchmark.return_code) + mock_auto_config.from_pretrained.assert_called_once() + + def test_preprocess_hf_auto_config_exception(self): + """An exception while downloading the config is caught and converted to failure.""" + benchmark = self._make_benchmark() + + with patch('transformers.AutoConfig') as mock_auto_config: + mock_auto_config.from_pretrained.side_effect = RuntimeError('boom') + + self.assertFalse(benchmark._preprocess_huggingface_models()) + + self.assertEqual(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE, benchmark.return_code) + + def test_preprocess_hf_happy_path_delegates_to_build_command(self): + """Happy path: config + memory check pass and the build helper is invoked.""" + benchmark = self._make_benchmark() + + fake_config = MagicMock(name='AutoConfigInstance') + with patch('transformers.AutoConfig') as mock_auto_config, \ + patch(f'{_TENSORRT_MODULE}.HuggingFaceModelLoader') as mock_loader_cls, \ + patch.object( + benchmark, '_build_trtexec_command_for_hf', return_value=True + ) as mock_build: + 
mock_auto_config.from_pretrained.return_value = fake_config + mock_loader_cls.check_memory_fits.return_value = (True, 4.0, 0.02, 16.0) + + self.assertTrue(benchmark._preprocess_huggingface_models()) + + # AutoConfig must be called with trust_remote_code matching --allow_remote_code (False here). + config_kwargs = mock_auto_config.from_pretrained.call_args.kwargs + self.assertFalse(config_kwargs['trust_remote_code']) + # Memory check must run for fp32 (ONNX export dtype) regardless of --precision. + mem_args, mem_kwargs = mock_loader_cls.check_memory_fits.call_args + self.assertEqual('fp32', mem_args[2]) + self.assertEqual('inference', mem_kwargs.get('mode')) + mock_build.assert_called_once() + + def test_preprocess_hf_allow_remote_code_propagates(self): + """--allow_remote_code is forwarded as trust_remote_code=True.""" + benchmark = self._make_benchmark(allow_remote_code=True) + + with patch('transformers.AutoConfig') as mock_auto_config, \ + patch(f'{_TENSORRT_MODULE}.HuggingFaceModelLoader') as mock_loader_cls, \ + patch.object(benchmark, '_build_trtexec_command_for_hf', return_value=True): + mock_auto_config.from_pretrained.return_value = MagicMock() + mock_loader_cls.check_memory_fits.return_value = (True, 1.0, 0.01, 16.0) + + benchmark._preprocess_huggingface_models() + + self.assertTrue(mock_auto_config.from_pretrained.call_args.kwargs['trust_remote_code']) + + # ------------------------------------------------------------------ + # _build_trtexec_command_for_hf + # ------------------------------------------------------------------ + + def _patch_build_dependencies(self, onnx_path='/tmp/fake.onnx', input_shapes='input_ids:8x128'): + """Common patch context for _build_trtexec_command_for_hf tests.""" + loader_patch = patch(f'{_TENSORRT_MODULE}.HuggingFaceModelLoader') + msc_patch = patch(f'{_TENSORRT_MODULE}.ModelSourceConfig') + exporter_patch = patch(f'{_TENSORRT_MODULE}.torch2onnxExporter') + makedirs_patch = patch(f'{_TENSORRT_MODULE}.os.makedirs') + 
torch_patch = patch(f'{_TENSORRT_MODULE}.torch') + return loader_patch, msc_patch, exporter_patch, makedirs_patch, torch_patch + + def test_build_trtexec_command_for_hf_success(self): + """Happy path: command is appended and shape/precision flags are correct.""" + benchmark = self._make_benchmark(precision='fp16') + + loader_p, msc_p, exporter_p, makedirs_p, torch_p = self._patch_build_dependencies() + derived_shapes = 'input_ids:8x128,attention_mask:8x128' + with loader_p as mock_loader_cls, msc_p as mock_msc, exporter_p as mock_exporter_cls, \ + makedirs_p as mock_makedirs, torch_p as mock_torch, \ + patch.object(benchmark, '_derive_trt_input_shapes', return_value=derived_shapes) as mock_derive: + mock_torch.hub.get_dir.return_value = '/tmp/torchhub' + mock_torch.cuda.is_available.return_value = False + + mock_loader = MagicMock() + mock_loader_cls.return_value = mock_loader + mock_hf_model = MagicMock(name='HFModel') + mock_hf_config = MagicMock(name='HFConfig') + mock_loader.load_model_from_config.return_value = (mock_hf_model, mock_hf_config, None) + + mock_exporter = MagicMock() + mock_exporter_cls.return_value = mock_exporter + mock_exporter.export_huggingface_model.return_value = '/tmp/torchhub/checkpoints/trt_rank_0/m.onnx' + + ok = benchmark._build_trtexec_command_for_hf(hf_token=None, allow_remote_code=False) + + self.assertTrue(ok) + self.assertIs(benchmark._hf_config, mock_hf_config) + # makedirs called once with the rank-scoped output dir. + mock_makedirs.assert_called_once() + self.assertTrue(mock_makedirs.call_args.args[0].endswith('trt_rank_0')) + # ModelSourceConfig is constructed with float32 + device_map=None (CPU load). + msc_kwargs = mock_msc.call_args.kwargs + self.assertEqual('float32', msc_kwargs['torch_dtype']) + self.assertIsNone(msc_kwargs['device_map']) + self.assertEqual('huggingface', msc_kwargs['source']) + # Exporter called with the configured batch_size / seq_length. 
+ export_kwargs = mock_exporter.export_huggingface_model.call_args.kwargs + self.assertEqual(8, export_kwargs['batch_size']) + self.assertEqual(128, export_kwargs['seq_length']) + # _derive_trt_input_shapes was invoked with the exported ONNX path. + mock_derive.assert_called_once_with('/tmp/torchhub/checkpoints/trt_rank_0/m.onnx') + # Exactly one command appended, containing the expected flags. + self.assertEqual(1, len(benchmark._commands)) + cmd = benchmark._commands[0] + self.assertIn('/fake/bin/trtexec', cmd) + self.assertIn('--onnx=/tmp/torchhub/checkpoints/trt_rank_0/m.onnx', cmd) + self.assertIn(f'--optShapes={derived_shapes}', cmd) + self.assertIn('--memPoolSize=workspace:8192M', cmd) + self.assertIn('--fp16', cmd) + self.assertIn('--iterations=128', cmd) + self.assertIn('--percentile=99', cmd) + # pytorch_models is rewritten so _process_raw_result can key off the HF id. + self.assertEqual(['prajjwal1_bert-tiny'], benchmark._args.pytorch_models) + + def test_build_trtexec_command_for_hf_fp32_omits_precision_flag(self): + """fp32 precision must not emit a ``--fp32`` or ``--int8`` flag.""" + benchmark = self._make_benchmark(precision='fp32') + + loader_p, msc_p, exporter_p, makedirs_p, torch_p = self._patch_build_dependencies() + with loader_p as mock_loader_cls, msc_p, exporter_p as mock_exporter_cls, \ + makedirs_p, torch_p as mock_torch, \ + patch.object(benchmark, '_derive_trt_input_shapes', return_value='input_ids:8x128'): + mock_torch.hub.get_dir.return_value = '/tmp/torchhub' + mock_torch.cuda.is_available.return_value = False + mock_loader_cls.return_value.load_model_from_config.return_value = (MagicMock(), MagicMock(), None) + mock_exporter_cls.return_value.export_huggingface_model.return_value = ( + '/tmp/torchhub/checkpoints/trt_rank_0/m.onnx' + ) + + self.assertTrue(benchmark._build_trtexec_command_for_hf(None, False)) + + cmd = benchmark._commands[0] + self.assertNotIn('--fp16', cmd) + self.assertNotIn('--fp32', cmd) + self.assertNotIn('--int8', 
cmd) + + def test_build_trtexec_command_for_hf_export_failure(self): + """If ONNX export returns falsy, the build fails and no command is queued.""" + benchmark = self._make_benchmark() + + loader_p, msc_p, exporter_p, makedirs_p, torch_p = self._patch_build_dependencies() + with loader_p as mock_loader_cls, msc_p, exporter_p as mock_exporter_cls, \ + makedirs_p, torch_p as mock_torch: + mock_torch.hub.get_dir.return_value = '/tmp/torchhub' + mock_torch.cuda.is_available.return_value = False + mock_loader_cls.return_value.load_model_from_config.return_value = (MagicMock(), MagicMock(), None) + mock_exporter_cls.return_value.export_huggingface_model.return_value = None + + self.assertFalse(benchmark._build_trtexec_command_for_hf(None, False)) + + self.assertEqual([], benchmark._commands) + self.assertEqual(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE, benchmark.return_code) + + def test_build_trtexec_command_for_hf_uses_proc_rank_env(self): + """PROC_RANK env var (or CUDA_VISIBLE_DEVICES) controls the rank subdir.""" + benchmark = self._make_benchmark() + + loader_p, msc_p, exporter_p, makedirs_p, torch_p = self._patch_build_dependencies() + with loader_p as mock_loader_cls, msc_p, exporter_p as mock_exporter_cls, \ + makedirs_p as mock_makedirs, torch_p as mock_torch, \ + patch.object(benchmark, '_derive_trt_input_shapes', return_value='input_ids:8x128'), \ + patch.dict('os.environ', {'CUDA_VISIBLE_DEVICES': '0', 'PROC_RANK': '3'}, clear=False): + mock_torch.hub.get_dir.return_value = '/tmp/torchhub' + mock_torch.cuda.is_available.return_value = False + mock_loader_cls.return_value.load_model_from_config.return_value = (MagicMock(), MagicMock(), None) + mock_exporter_cls.return_value.export_huggingface_model.return_value = ( + '/tmp/torchhub/checkpoints/trt_rank_3/m.onnx' + ) + + self.assertTrue(benchmark._build_trtexec_command_for_hf(None, False)) + + self.assertTrue(mock_makedirs.call_args.args[0].endswith('trt_rank_3')) + + # 
------------------------------------------------------------------ + # _derive_trt_input_shapes + # ------------------------------------------------------------------ + + def test_derive_trt_input_shapes_vision_by_pixel_values_name(self): + """Inputs named ``pixel_values`` are routed to the vision helper.""" + benchmark = self._make_benchmark(batch_size=4) + # 3D so we rely on the name heuristic, not the dim-count heuristic. + vision_input = _make_onnx_input('pixel_values', [0, 3, 224]) + # Pad the input to 4D so vision helper can index dims[1..3] safely. + vision_input_4d = _make_onnx_input('pixel_values', [0, 3, 224, 224]) + model = _make_onnx_model([vision_input_4d]) + + with patch('onnx.load', return_value=model): + shapes = benchmark._derive_trt_input_shapes('/tmp/fake.onnx') + + self.assertEqual('pixel_values:4x3x224x224', shapes) + _ = vision_input # silence unused-warning in case of future refactor + + def test_derive_trt_input_shapes_vision_by_4d_shape(self): + """A 4D non-``pixel_values`` input is still treated as vision.""" + benchmark = self._make_benchmark(batch_size=2) + vision_input = _make_onnx_input('image', [0, 3, 256, 256]) + model = _make_onnx_model([vision_input]) + + with patch('onnx.load', return_value=model): + shapes = benchmark._derive_trt_input_shapes('/tmp/fake.onnx') + + self.assertEqual('image:2x3x256x256', shapes) + + def test_derive_trt_input_shapes_nlp_multi_input(self): + """NLP routing: 2D inputs are emitted as ``name:BxS`` and comma-joined.""" + benchmark = self._make_benchmark(batch_size=4, seq_length=64) + inputs = [ + _make_onnx_input('input_ids', [0, 0]), + _make_onnx_input('attention_mask', [0, 0]), + ] + model = _make_onnx_model(inputs) + + with patch('onnx.load', return_value=model): + shapes = benchmark._derive_trt_input_shapes('/tmp/fake.onnx') + + self.assertEqual('input_ids:4x64,attention_mask:4x64', shapes) + + def test_derive_trt_input_shapes_filters_initializers(self): + """Initializer-named graph inputs are 
excluded from runtime inputs.""" + benchmark = self._make_benchmark(batch_size=1, seq_length=16) + runtime = _make_onnx_input('input_ids', [0, 0]) + weight = _make_onnx_input('weight', [768, 768]) + model = _make_onnx_model([weight, runtime], initializer_names=['weight']) + + with patch('onnx.load', return_value=model): + shapes = benchmark._derive_trt_input_shapes('/tmp/fake.onnx') + + self.assertEqual('input_ids:1x16', shapes) + + def test_derive_trt_input_shapes_no_runtime_inputs_raises(self): + """A graph with only initializer-shadowed inputs raises ValueError.""" + benchmark = self._make_benchmark() + weight = _make_onnx_input('weight', [768, 768]) + model = _make_onnx_model([weight], initializer_names=['weight']) + + with patch('onnx.load', return_value=model): + with self.assertRaises(ValueError): + benchmark._derive_trt_input_shapes('/tmp/fake.onnx') + + # ------------------------------------------------------------------ + # _derive_vision_input_shape + # ------------------------------------------------------------------ + + def test_derive_vision_input_shape_static_dims(self): + """Static ONNX dims are used verbatim (apart from the batch dim).""" + benchmark = self._make_benchmark(batch_size=16) + vision_input = _make_onnx_input('pixel_values', [0, 3, 384, 384]) + + result = benchmark._derive_vision_input_shape(vision_input, 'pixel_values') + + self.assertEqual('pixel_values:16x3x384x384', result) + + def test_derive_vision_input_shape_dynamic_with_hf_config_scalar(self): + """Dynamic dims fall back to ``_hf_config`` (scalar ``image_size``).""" + benchmark = self._make_benchmark(batch_size=4) + benchmark._hf_config = SimpleNamespace(num_channels=1, image_size=160) + vision_input = _make_onnx_input('pixel_values', [0, 0, 0, 0]) + + result = benchmark._derive_vision_input_shape(vision_input, 'pixel_values') + + self.assertEqual('pixel_values:4x1x160x160', result) + + def test_derive_vision_input_shape_dynamic_with_hf_config_tuple(self): + """Dynamic dims 
fall back to ``_hf_config`` (tuple/list ``image_size``).""" + benchmark = self._make_benchmark(batch_size=2) + benchmark._hf_config = SimpleNamespace(num_channels=3, image_size=(192, 384)) + vision_input = _make_onnx_input('pixel_values', [0, 0, 0, 0]) + + result = benchmark._derive_vision_input_shape(vision_input, 'pixel_values') + + self.assertEqual('pixel_values:2x3x192x384', result) + + def test_derive_vision_input_shape_dynamic_without_hf_config_uses_defaults(self): + """No ``_hf_config`` + dynamic dims → default (3, 224, 224).""" + benchmark = self._make_benchmark(batch_size=1) + # Ensure no _hf_config is set. + if hasattr(benchmark, '_hf_config'): + del benchmark._hf_config + vision_input = _make_onnx_input('pixel_values', [0, 0, 0, 0]) + + result = benchmark._derive_vision_input_shape(vision_input, 'pixel_values') + + self.assertEqual('pixel_values:1x3x224x224', result) + + # ------------------------------------------------------------------ + # _derive_nlp_input_shapes + # ------------------------------------------------------------------ + + def test_derive_nlp_input_shapes_single_2d(self): + """A single 2D input emits a single ``name:BxS`` entry.""" + benchmark = self._make_benchmark(batch_size=8, seq_length=256) + inputs = [_make_onnx_input('input_ids', [0, 0])] + + result = benchmark._derive_nlp_input_shapes(inputs) + + self.assertEqual('input_ids:8x256', result) + + def test_derive_nlp_input_shapes_multiple_inputs(self): + """Multiple inputs are joined with commas in declaration order.""" + benchmark = self._make_benchmark(batch_size=4, seq_length=64) + inputs = [ + _make_onnx_input('input_ids', [0, 0]), + _make_onnx_input('attention_mask', [0, 0]), + _make_onnx_input('token_type_ids', [0, 0]), + ] + + result = benchmark._derive_nlp_input_shapes(inputs) + + self.assertEqual( + 'input_ids:4x64,attention_mask:4x64,token_type_ids:4x64', + result, + ) + + def test_derive_nlp_input_shapes_4d_input_uses_bx1xsxs(self): + """A 4D input (rare for NLP) gets the 
``Bx1xSxS`` shape.""" + benchmark = self._make_benchmark(batch_size=2, seq_length=32) + inputs = [_make_onnx_input('attention_bias', [0, 0, 0, 0])] + + result = benchmark._derive_nlp_input_shapes(inputs) + + self.assertEqual('attention_bias:2x1x32x32', result) + + def test_derive_nlp_input_shapes_default_seq_length_when_missing(self): + """When ``_args.seq_length`` is absent, the helper defaults to 512.""" + benchmark = self._make_benchmark() + # Strip seq_length to trigger the getattr-default branch. + del benchmark._args.seq_length + inputs = [_make_onnx_input('input_ids', [0, 0])] + + result = benchmark._derive_nlp_input_shapes(inputs) + + self.assertEqual('input_ids:8x512', result) From 466f5ad6494153003a19c798d3c02cbe92d0c905 Mon Sep 17 00:00:00 2001 From: Aishwarya-Tonpe Date: Thu, 4 Jun 2026 18:13:41 +0000 Subject: [PATCH 12/12] Increasing test coverage by adding new tests - coverage fell short due to AI suggested comments resolution code --- .../ort_inference_performance.py | 1 - .../tensorrt_inference_performance.py | 1 - .../test_export_torch_to_onnx.py | 450 ++++++++++++++++++ .../test_ort_inference_performance.py | 306 ++++++++++++ .../test_tensorrt_inference_performance.py | 13 +- 5 files changed, 760 insertions(+), 11 deletions(-) create mode 100644 tests/benchmarks/micro_benchmarks/test_export_torch_to_onnx.py diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py index 309c37b06..37f95a1ab 100644 --- a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py @@ -23,7 +23,6 @@ class ORTInferenceBenchmark(MicroBenchmark): """ONNXRuntime inference micro-benchmark class.""" - def __init__(self, name, parameters=''): """Constructor. 
diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py index 386e84dc0..5e09a8f1c 100644 --- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py @@ -23,7 +23,6 @@ class TensorRTInferenceBenchmark(MicroBenchmarkWithInvoke): """TensorRT inference micro-benchmark class.""" - def __init__(self, name, parameters=''): """Constructor. diff --git a/tests/benchmarks/micro_benchmarks/test_export_torch_to_onnx.py b/tests/benchmarks/micro_benchmarks/test_export_torch_to_onnx.py new file mode 100644 index 000000000..a5ac5b0ce --- /dev/null +++ b/tests/benchmarks/micro_benchmarks/test_export_torch_to_onnx.py @@ -0,0 +1,450 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Unit tests for ``torch2onnxExporter`` HuggingFace export helpers. + +Covers: +- ``export_huggingface_model`` (orchestration, error path, vision/NLP routing). +- ``_build_vision_export_inputs`` (config-driven C/H/W, VisionModelWrapper). +- ``_build_nlp_export_inputs`` (input_ids + attention_mask, NLPModelWrapper). +- ``_build_onnx_export_kwargs`` (opset/dynamic_axes; external-data branch). + +Tests are pure-CPU and pure-unit: ``torch.onnx.export`` is patched out so we +never touch the ONNX runtime, and dummy ``torch.nn.Module`` instances stand in +for HuggingFace models. 
+""" + +import inspect +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import pytest +import torch + +from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter + +_EXPORTER_MODULE = 'superbench.benchmarks.micro_benchmarks._export_torch_to_onnx' + +# --------------------------------------------------------------------------- +# Fixtures / helpers +# --------------------------------------------------------------------------- + + +@pytest.fixture +def exporter(tmp_path, monkeypatch): + """Build a torch2onnxExporter rooted at a tmp dir to avoid touching the real torch hub.""" + monkeypatch.setattr(torch.hub, 'get_dir', lambda: str(tmp_path)) + return torch2onnxExporter() + + +class _TinyVisionModel(torch.nn.Module): + """Minimal stand-in for a HuggingFace vision model. + + Mimics enough of the HF API for the export helpers: ``main_input_name``, + a ``config`` namespace, and a ``forward`` that accepts ``pixel_values`` and + returns an object with a ``logits`` attribute. + """ + + main_input_name = 'pixel_values' + + def __init__(self, num_channels=3, image_size=224, num_classes=4): + super().__init__() + self.config = SimpleNamespace( + num_channels=num_channels, + image_size=image_size, + use_cache=True, + ) + # A trivial trainable parameter so .parameters() / .element_size() are exercised. + self.linear = torch.nn.Linear(num_channels, num_classes) + + def forward(self, pixel_values): + # Reduce H/W and project channel dim, mimicking a classifier head. 
+ flat = pixel_values.mean(dim=(2, 3)) + return SimpleNamespace(logits=self.linear(flat)) + + +class _TinyNLPModel(torch.nn.Module): + """Minimal stand-in for a HuggingFace NLP model with input_ids + attention_mask.""" + + main_input_name = 'input_ids' + + def __init__(self, vocab_size=128, hidden=8): + super().__init__() + self.config = SimpleNamespace(use_cache=True) + self.embed = torch.nn.Embedding(vocab_size, hidden) + + def forward(self, input_ids, attention_mask): + h = self.embed(input_ids) + # last_hidden_state path is exercised here. + return SimpleNamespace(last_hidden_state=h * attention_mask.unsqueeze(-1).to(h.dtype)) + + +# --------------------------------------------------------------------------- +# _build_vision_export_inputs +# --------------------------------------------------------------------------- + + +def test_build_vision_export_inputs_default_shape(exporter): + """Default config (3 channels, 224 image_size) → (B, 3, 224, 224) tensor.""" + model = _TinyVisionModel(num_channels=3, image_size=224) + + wrapped, args, names, axes = exporter._build_vision_export_inputs( + model, batch_size=2, model_dtype=torch.float32, device='cpu' + ) + + assert names == ['pixel_values'] + assert axes == {'pixel_values': {0: 'batch_size'}, 'output': {0: 'batch_size'}} + assert len(args) == 1 + pixel_values = args[0] + assert tuple(pixel_values.shape) == (2, 3, 224, 224) + assert pixel_values.dtype == torch.float32 + assert pixel_values.device.type == 'cpu' + # Wrapper is callable and returns the inner model's logits tensor (not a SimpleNamespace). 
+ out = wrapped(pixel_values) + assert isinstance(out, torch.Tensor) + + +def test_build_vision_export_inputs_custom_channels_and_size(exporter): + """Non-default num_channels / scalar image_size are honored.""" + model = _TinyVisionModel(num_channels=1, image_size=384) + + _, args, _, _ = exporter._build_vision_export_inputs(model, batch_size=4, model_dtype=torch.float32, device='cpu') + pixel_values = args[0] + assert tuple(pixel_values.shape) == (4, 1, 384, 384) + + +def test_build_vision_export_inputs_tuple_image_size(exporter): + """Tuple/list ``image_size`` is unpacked as (H, W).""" + model = _TinyVisionModel(num_channels=3) + model.config.image_size = (192, 384) + + _, args, _, _ = exporter._build_vision_export_inputs(model, batch_size=1, model_dtype=torch.float32, device='cpu') + assert tuple(args[0].shape) == (1, 3, 192, 384) + + +def test_build_vision_export_inputs_wrapper_handles_last_hidden_state(exporter): + """The wrapper falls back to ``last_hidden_state`` when ``logits`` is absent.""" + model = _TinyVisionModel() + # Override forward to return only last_hidden_state. 
+ hidden = torch.zeros(2, 4) + + class _ModelOnlyHidden(torch.nn.Module): + main_input_name = 'pixel_values' + + def __init__(self): + super().__init__() + self.config = SimpleNamespace(num_channels=3, image_size=8) + self.dummy = torch.nn.Linear(1, 1) + + def forward(self, pixel_values): + return SimpleNamespace(last_hidden_state=hidden) + + custom = _ModelOnlyHidden() + wrapped, args, _, _ = exporter._build_vision_export_inputs( + custom, batch_size=2, model_dtype=torch.float32, device='cpu' + ) + out = wrapped(args[0]) + assert torch.equal(out, hidden) + _ = model # keep fixture-ish ref + + +def test_build_vision_export_inputs_wrapper_handles_tuple_output(exporter): + """The wrapper returns ``outputs[0]`` when the model emits a tuple.""" + + class _TupleModel(torch.nn.Module): + main_input_name = 'pixel_values' + + def __init__(self): + super().__init__() + self.config = SimpleNamespace(num_channels=3, image_size=8) + self.dummy = torch.nn.Linear(1, 1) + + def forward(self, pixel_values): + return (torch.ones(pixel_values.size(0), 2), torch.zeros(1)) + + wrapped, args, _, _ = exporter._build_vision_export_inputs( + _TupleModel(), batch_size=3, model_dtype=torch.float32, device='cpu' + ) + out = wrapped(args[0]) + assert tuple(out.shape) == (3, 2) + + +# --------------------------------------------------------------------------- +# _build_nlp_export_inputs +# --------------------------------------------------------------------------- + + +def test_build_nlp_export_inputs_basic(exporter): + """NLP path emits int64 ``input_ids`` + ``attention_mask`` of shape (B, S).""" + model = _TinyNLPModel() + + wrapped, args, names, axes = exporter._build_nlp_export_inputs(model, batch_size=2, seq_length=16, device='cpu') + + assert names == ['input_ids', 'attention_mask'] + # Dynamic axes: batch_size + seq_length on both inputs and the output. 
+ assert axes['input_ids'] == {0: 'batch_size', 1: 'seq_length'} + assert axes['attention_mask'] == {0: 'batch_size', 1: 'seq_length'} + assert axes['output'] == {0: 'batch_size', 1: 'seq_length'} + assert len(args) == 2 + input_ids, attention_mask = args + assert tuple(input_ids.shape) == (2, 16) + assert tuple(attention_mask.shape) == (2, 16) + assert input_ids.dtype == torch.int64 + assert attention_mask.dtype == torch.int64 + # All ones ⇒ token id 1 is within the embedding's vocab. + assert torch.all(input_ids == 1) + # Wrapper runs the inner model and unwraps last_hidden_state. + out = wrapped(input_ids, attention_mask) + assert isinstance(out, torch.Tensor) + assert tuple(out.shape) == (2, 16, 8) + + +def test_build_nlp_export_inputs_wrapper_handles_logits(exporter): + """When the inner model exposes ``logits``, the wrapper returns those.""" + + class _LogitsModel(torch.nn.Module): + main_input_name = 'input_ids' + + def __init__(self): + super().__init__() + self.config = SimpleNamespace() + self.embed = torch.nn.Embedding(8, 4) + + def forward(self, input_ids, attention_mask): + return SimpleNamespace(logits=self.embed(input_ids)) + + wrapped, args, _, _ = exporter._build_nlp_export_inputs(_LogitsModel(), batch_size=1, seq_length=4, device='cpu') + out = wrapped(*args) + assert isinstance(out, torch.Tensor) + assert tuple(out.shape) == (1, 4, 4) + + +def test_build_nlp_export_inputs_wrapper_handles_tuple(exporter): + """The wrapper returns ``outputs[0]`` when the model emits a tuple.""" + + class _TupleNLP(torch.nn.Module): + main_input_name = 'input_ids' + + def __init__(self): + super().__init__() + self.config = SimpleNamespace() + self.embed = torch.nn.Embedding(8, 4) + + def forward(self, input_ids, attention_mask): + return (self.embed(input_ids), torch.zeros(1)) + + wrapped, args, _, _ = exporter._build_nlp_export_inputs(_TupleNLP(), batch_size=1, seq_length=2, device='cpu') + out = wrapped(*args) + assert tuple(out.shape) == (1, 2, 4) + + +# 
--------------------------------------------------------------------------- +# _build_onnx_export_kwargs +# --------------------------------------------------------------------------- + + +def test_build_onnx_export_kwargs_small_model_no_external_data(exporter): + """Small models (< 2GB) do not request external-data format.""" + model = _TinyNLPModel() + input_names = ['input_ids', 'attention_mask'] + dynamic_axes = {'input_ids': {0: 'b'}} + + kwargs = exporter._build_onnx_export_kwargs(model, input_names, dynamic_axes) + + assert kwargs['opset_version'] == 14 + assert kwargs['do_constant_folding'] is True + assert kwargs['input_names'] == input_names + assert kwargs['output_names'] == ['output'] + assert kwargs['dynamic_axes'] is dynamic_axes + assert 'external_data' not in kwargs + assert 'use_external_data_format' not in kwargs + + +def test_build_onnx_export_kwargs_large_model_uses_external_data_modern(exporter): + """For >2GB models on PyTorch with ``external_data`` param, that key is used.""" + fake_model = MagicMock() + # 3 GB worth of fp32 params = 3 * (1024**3) / 4 numel. 
+ big_param = SimpleNamespace( + numel=lambda: int(3 * (1024**3) / 4), + element_size=lambda: 4, + ) + fake_model.parameters.return_value = [big_param] + + fake_sig = inspect.Signature( + parameters=[inspect.Parameter('external_data', inspect.Parameter.POSITIONAL_OR_KEYWORD)] + ) + with patch(f'{_EXPORTER_MODULE}.inspect.signature', return_value=fake_sig): + kwargs = exporter._build_onnx_export_kwargs(fake_model, ['input_ids'], {}) + + assert kwargs['external_data'] is True + assert 'use_external_data_format' not in kwargs + + +def test_build_onnx_export_kwargs_large_model_uses_external_data_legacy(exporter): + """For >2GB models on older PyTorch, ``use_external_data_format`` is used instead.""" + fake_model = MagicMock() + big_param = SimpleNamespace( + numel=lambda: int(3 * (1024**3) / 4), + element_size=lambda: 4, + ) + fake_model.parameters.return_value = [big_param] + + fake_sig = inspect.Signature( + parameters=[inspect.Parameter('use_external_data_format', inspect.Parameter.POSITIONAL_OR_KEYWORD)] + ) + with patch(f'{_EXPORTER_MODULE}.inspect.signature', return_value=fake_sig): + kwargs = exporter._build_onnx_export_kwargs(fake_model, ['input_ids'], {}) + + assert kwargs['use_external_data_format'] is True + assert 'external_data' not in kwargs + + +# --------------------------------------------------------------------------- +# export_huggingface_model +# --------------------------------------------------------------------------- + + +def test_export_huggingface_model_vision_routes_to_vision_helper(exporter, tmp_path): + """Vision model (main_input_name='pixel_values') uses the vision helper.""" + model = _TinyVisionModel(num_channels=3, image_size=32) + + captured = {} + + def fake_export(wrapped_model, args, file_name, **kwargs): + captured['wrapped_model'] = wrapped_model + captured['args'] = args + captured['file_name'] = file_name + captured['kwargs'] = kwargs + Path(file_name).touch() + + with patch(f'{_EXPORTER_MODULE}.torch.onnx.export', 
side_effect=fake_export): + result = exporter.export_huggingface_model( + model=model, + model_name='vit-tiny', + batch_size=2, + seq_length=16, + output_dir=str(tmp_path), + ) + + assert result == str(tmp_path / 'vit-tiny.onnx') + # Vision shape: (B, C, H, W) tuple of length 1. + assert len(captured['args']) == 1 + assert tuple(captured['args'][0].shape) == (2, 3, 32, 32) + assert captured['kwargs']['input_names'] == ['pixel_values'] + assert captured['kwargs']['opset_version'] == 14 + # use_cache disabled to avoid DynamicCache issues. + assert model.config.use_cache is False + + +def test_export_huggingface_model_nlp_routes_to_nlp_helper(exporter, tmp_path): + """NLP model (main_input_name='input_ids') uses the NLP helper.""" + model = _TinyNLPModel() + + captured = {} + + def fake_export(wrapped_model, args, file_name, **kwargs): + captured['args'] = args + captured['file_name'] = file_name + captured['kwargs'] = kwargs + Path(file_name).touch() + + with patch(f'{_EXPORTER_MODULE}.torch.onnx.export', side_effect=fake_export): + result = exporter.export_huggingface_model( + model=model, + model_name='bert-tiny', + batch_size=2, + seq_length=8, + output_dir=str(tmp_path), + ) + + assert result == str(tmp_path / 'bert-tiny.onnx') + assert len(captured['args']) == 2 + input_ids, attention_mask = captured['args'] + assert tuple(input_ids.shape) == (2, 8) + assert tuple(attention_mask.shape) == (2, 8) + assert captured['kwargs']['input_names'] == ['input_ids', 'attention_mask'] + + +def test_export_huggingface_model_default_output_dir(exporter): + """When ``output_dir`` is None, the exporter writes under self._onnx_model_path.""" + model = _TinyNLPModel() + + written = {} + + def fake_export(wrapped_model, args, file_name, **kwargs): + written['file_name'] = file_name + Path(file_name).touch() + + with patch(f'{_EXPORTER_MODULE}.torch.onnx.export', side_effect=fake_export): + result = exporter.export_huggingface_model(model=model, model_name='bert-default') + + 
expected = str(exporter._onnx_model_path / 'bert-default.onnx') + assert result == expected + assert written['file_name'] == expected + + +def test_export_huggingface_model_handles_export_failure(exporter, tmp_path): + """If ``torch.onnx.export`` raises, the helper returns '' and logs the error.""" + model = _TinyNLPModel() + + with patch(f'{_EXPORTER_MODULE}.torch.onnx.export', side_effect=RuntimeError('boom')): + result = exporter.export_huggingface_model( + model=model, + model_name='bert-fail', + batch_size=1, + seq_length=4, + output_dir=str(tmp_path), + ) + + assert result == '' + + +def test_export_huggingface_model_disables_use_cache(exporter, tmp_path): + """``model.config.use_cache`` is forced to False before export.""" + model = _TinyNLPModel() + model.config.use_cache = True + + with patch(f'{_EXPORTER_MODULE}.torch.onnx.export') as mock_export: + mock_export.side_effect = lambda *a, **kw: Path(a[2]).touch() + exporter.export_huggingface_model( + model=model, + model_name='bert-cache', + batch_size=1, + seq_length=4, + output_dir=str(tmp_path), + ) + + assert model.config.use_cache is False + + +def test_export_huggingface_model_default_main_input_name_is_nlp(exporter, tmp_path): + """Models without ``main_input_name`` default to the NLP path.""" + + class _NoMainInput(torch.nn.Module): + # Intentionally no main_input_name attribute. 
+ def __init__(self): + super().__init__() + self.config = SimpleNamespace(use_cache=False) + self.embed = torch.nn.Embedding(8, 4) + + def forward(self, input_ids, attention_mask): + return SimpleNamespace(last_hidden_state=self.embed(input_ids)) + + captured = {} + + def fake_export(wrapped_model, args, file_name, **kwargs): + captured['kwargs'] = kwargs + Path(file_name).touch() + + with patch(f'{_EXPORTER_MODULE}.torch.onnx.export', side_effect=fake_export): + result = exporter.export_huggingface_model( + model=_NoMainInput(), + model_name='no-main', + batch_size=1, + seq_length=4, + output_dir=str(tmp_path), + ) + + assert result == str(tmp_path / 'no-main.onnx') + assert captured['kwargs']['input_names'] == ['input_ids', 'attention_mask'] diff --git a/tests/benchmarks/micro_benchmarks/test_ort_inference_performance.py b/tests/benchmarks/micro_benchmarks/test_ort_inference_performance.py index 9d9d1b0db..1c20e9b11 100644 --- a/tests/benchmarks/micro_benchmarks/test_ort_inference_performance.py +++ b/tests/benchmarks/micro_benchmarks/test_ort_inference_performance.py @@ -5,14 +5,18 @@ import shutil from pathlib import Path +from types import SimpleNamespace from unittest import mock +from unittest.mock import MagicMock, patch +import pytest import torch import torchvision.models from tests.helper import decorator from superbench.benchmarks import BenchmarkRegistry, Platform, Precision, BenchmarkType, ReturnCode from superbench.benchmarks.micro_benchmarks.ort_inference_performance import ORTInferenceBenchmark +from superbench.benchmarks.result import BenchmarkResult @decorator.cuda_test @@ -66,3 +70,305 @@ def test_ort_inference_performance(mock_ort_session_run, mock_get_dir): metric = '{}_{}_time'.format(precision, model) assert (metric in benchmark.result) assert (metric in benchmark.raw_data) + + +# --------------------------------------------------------------------------- +# HuggingFace-path coverage for _preprocess_huggingface_models and +# 
_export_hf_model_to_onnx. These tests are pure unit tests with no CUDA / no +# HF network access; the model loader, ModelSourceConfig, and torch2onnxExporter +# are all mocked to keep the suite fast and deterministic. +# --------------------------------------------------------------------------- + +_ORT_MODULE = 'superbench.benchmarks.micro_benchmarks.ort_inference_performance' + + +def _make_ort_benchmark(**arg_overrides): + """Build an ORTInferenceBenchmark and minimally initialise its mutable state. + + Returns the benchmark with ``_args``, ``_result``, and the name-mangled + cache-path attribute populated so HF-path methods can be exercised in + isolation without going through the full ``_preprocess`` pipeline. + """ + benchmark = ORTInferenceBenchmark('ort-inference', parameters='') + benchmark._result = BenchmarkResult('ort-inference', BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1) + defaults = dict( + model_source='huggingface', + model_identifier='prajjwal1/bert-tiny', + allow_remote_code=False, + precision=Precision.FLOAT16, + batch_size=8, + seq_length=128, + graph_opt_level=3, + num_warmup=1, + num_steps=1, + pytorch_models=[], + require_cuda=False, + log_raw_data=False, + ) + defaults.update(arg_overrides) + benchmark._args = SimpleNamespace(**defaults) + # The HF helpers reference the name-mangled cache path; set it explicitly so + # we don't depend on torch.hub.get_dir() in unit tests. 
+ benchmark._ORTInferenceBenchmark__model_cache_path = Path('/tmp/sb-ort-test-cache') + return benchmark + + +# --------------------------------------------------------------------------- +# _preprocess_huggingface_models +# --------------------------------------------------------------------------- + + +def test_preprocess_hf_missing_model_identifier(): + """Missing --model_identifier is rejected before any HF I/O.""" + benchmark = _make_ort_benchmark(model_identifier=None) + + assert benchmark._preprocess_huggingface_models() is False + assert benchmark.return_code == ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE + + +def test_preprocess_hf_invalid_identifier(): + """Path-like / unsafe identifier is rejected by validate_model_identifier.""" + benchmark = _make_ort_benchmark(model_identifier='../etc/passwd') + + assert benchmark._preprocess_huggingface_models() is False + assert benchmark.return_code == ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE + + +def test_preprocess_hf_memory_check_fails(): + """check_memory_fits=False short-circuits with EXECUTION_FAILURE.""" + benchmark = _make_ort_benchmark() + + with patch('transformers.AutoConfig') as mock_auto_config, \ + patch(f'{_ORT_MODULE}.HuggingFaceModelLoader') as mock_loader_cls: + mock_auto_config.from_pretrained.return_value = MagicMock(name='hf_config') + mock_loader_cls.check_memory_fits.return_value = (False, 1000.0, 30.0, 16.0) + + assert benchmark._preprocess_huggingface_models() is False + + assert benchmark.return_code == ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE + mock_auto_config.from_pretrained.assert_called_once() + + +def test_preprocess_hf_auto_config_exception(): + """An exception while downloading the config is converted to failure.""" + benchmark = _make_ort_benchmark() + + with patch('transformers.AutoConfig') as mock_auto_config: + mock_auto_config.from_pretrained.side_effect = RuntimeError('boom') + + assert benchmark._preprocess_huggingface_models() is False + + assert benchmark.return_code 
== ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE + + +def test_preprocess_hf_happy_path_delegates_to_export(): + """Happy path: config + memory check pass and the export helper runs.""" + benchmark = _make_ort_benchmark() + + fake_hf_config = MagicMock(name='hf_config') + with patch('transformers.AutoConfig') as mock_auto_config, \ + patch(f'{_ORT_MODULE}.HuggingFaceModelLoader') as mock_loader_cls, \ + patch.object(benchmark, '_export_hf_model_to_onnx', return_value=True) as mock_export: + mock_auto_config.from_pretrained.return_value = fake_hf_config + mock_loader_cls.check_memory_fits.return_value = (True, 4.0, 0.02, 16.0) + + assert benchmark._preprocess_huggingface_models() is True + + # AutoConfig is called with trust_remote_code matching --allow_remote_code (False). + config_kwargs = mock_auto_config.from_pretrained.call_args.kwargs + assert config_kwargs['trust_remote_code'] is False + # _hf_config is stashed for __inference() to read vocab_size later. + assert benchmark._hf_config is fake_hf_config + # Memory check uses the runtime precision (float16 here). + mem_args, mem_kwargs = mock_loader_cls.check_memory_fits.call_args + assert mem_args[2] == 'float16' + assert mem_kwargs.get('mode') == 'inference' + # Export helper receives the pre-downloaded config to avoid a redundant fetch. 
+ export_args, _ = mock_export.call_args + assert export_args[2] is fake_hf_config + + +def test_preprocess_hf_int8_uses_float32_for_memory_check(): + """INT8 precision still does the memory check against float32 weights.""" + benchmark = _make_ort_benchmark(precision=Precision.INT8) + + with patch('transformers.AutoConfig') as mock_auto_config, \ + patch(f'{_ORT_MODULE}.HuggingFaceModelLoader') as mock_loader_cls, \ + patch.object(benchmark, '_export_hf_model_to_onnx', return_value=True): + mock_auto_config.from_pretrained.return_value = MagicMock() + mock_loader_cls.check_memory_fits.return_value = (True, 1.0, 0.01, 16.0) + + assert benchmark._preprocess_huggingface_models() is True + + mem_args, _ = mock_loader_cls.check_memory_fits.call_args + assert mem_args[2] == 'float32' + + +def test_preprocess_hf_allow_remote_code_propagates(): + """--allow_remote_code is forwarded as trust_remote_code=True to AutoConfig.""" + benchmark = _make_ort_benchmark(allow_remote_code=True) + + with patch('transformers.AutoConfig') as mock_auto_config, \ + patch(f'{_ORT_MODULE}.HuggingFaceModelLoader') as mock_loader_cls, \ + patch.object(benchmark, '_export_hf_model_to_onnx', return_value=True): + mock_auto_config.from_pretrained.return_value = MagicMock() + mock_loader_cls.check_memory_fits.return_value = (True, 1.0, 0.01, 16.0) + + benchmark._preprocess_huggingface_models() + + assert mock_auto_config.from_pretrained.call_args.kwargs['trust_remote_code'] is True + + +# --------------------------------------------------------------------------- +# _export_hf_model_to_onnx +# --------------------------------------------------------------------------- + + +@pytest.fixture +def mock_export_dependencies(tmp_path): + """Patch the loader, ModelSourceConfig, exporter, and torch.cuda for export tests. + + Yields a SimpleNamespace bundle of mock handles plus the exporter's resolved + ONNX output path, so each test can assert on whichever it needs. 
+ """ + rank_dir = tmp_path / 'checkpoints' + + with patch(f'{_ORT_MODULE}.HuggingFaceModelLoader') as loader_cls, \ + patch(f'{_ORT_MODULE}.ModelSourceConfig') as msc, \ + patch(f'{_ORT_MODULE}.torch.cuda') as torch_cuda: + loader = MagicMock() + loader_cls.return_value = loader + loader.load_model_from_config.return_value = (MagicMock(name='hf_model'), MagicMock(), None) + torch_cuda.is_available.return_value = False + + # Patch the exporter where it is imported (inside _export_hf_model_to_onnx). + with patch('superbench.benchmarks.micro_benchmarks._export_torch_to_onnx.torch2onnxExporter') as exporter_cls: + exporter = MagicMock() + exporter_cls.return_value = exporter + + def _fake_export(model, model_name, batch_size, seq_length, output_dir): + """Simulate a successful ONNX export by writing the file the exporter would produce.""" + out = Path(output_dir) / f'{model_name}.onnx' + out.parent.mkdir(parents=True, exist_ok=True) + out.touch() + return str(out) + + exporter.export_huggingface_model.side_effect = _fake_export + + yield SimpleNamespace( + loader_cls=loader_cls, + loader=loader, + msc=msc, + exporter_cls=exporter_cls, + exporter=exporter, + rank_dir=rank_dir, + ) + + +def test_export_hf_model_to_onnx_fp16_success(mock_export_dependencies, tmp_path): + """fp16 path: ModelSourceConfig dtype=float16, exporter writes ONNX, no quantization.""" + benchmark = _make_ort_benchmark(precision=Precision.FLOAT16) + benchmark._ORTInferenceBenchmark__model_cache_path = tmp_path / 'checkpoints' + + with patch.dict('os.environ', {'CUDA_VISIBLE_DEVICES': '0'}, clear=False): + ok = benchmark._export_hf_model_to_onnx(hf_token='abc', allow_remote_code=False, hf_config=MagicMock()) + + assert ok is True + # ModelSourceConfig built with float16 (precision dtype) and device_map=None. 
+ msc_kwargs = mock_export_dependencies.msc.call_args.kwargs + assert msc_kwargs['torch_dtype'] == 'float16' + assert msc_kwargs['device_map'] is None + assert msc_kwargs['hf_token'] == 'abc' + # load_model_from_config is invoked with the pre-downloaded config to skip a redundant fetch. + load_kwargs = mock_export_dependencies.loader.load_model_from_config.call_args.kwargs + assert load_kwargs['device'] == 'cpu' + assert load_kwargs['config_pretrained'] is not None + # Exporter receives precision-tagged model name and the rank-scoped output dir. + export_kwargs = mock_export_dependencies.exporter.export_huggingface_model.call_args.kwargs + assert export_kwargs['model_name'] == 'prajjwal1_bert-tiny.float16' + assert export_kwargs['output_dir'].endswith('rank_0') + assert export_kwargs['batch_size'] == 8 + assert export_kwargs['seq_length'] == 128 + # pytorch_models is rewritten to the bare HF id (no precision suffix). + assert benchmark._args.pytorch_models == ['prajjwal1_bert-tiny'] + # Cache path now points at the rank subdirectory. + assert str(benchmark._ORTInferenceBenchmark__model_cache_path).endswith('rank_0') + + +def test_export_hf_model_to_onnx_int8_invokes_quantize(mock_export_dependencies, tmp_path): + """INT8 path: ONNX is exported as float32 first, then quantize_dynamic is called.""" + benchmark = _make_ort_benchmark(precision=Precision.INT8) + benchmark._ORTInferenceBenchmark__model_cache_path = tmp_path / 'checkpoints' + + fake_quantize_module = MagicMock() + with patch.dict('sys.modules', {'onnxruntime.quantization': fake_quantize_module}), \ + patch.dict('os.environ', {'CUDA_VISIBLE_DEVICES': '0'}, clear=False): + ok = benchmark._export_hf_model_to_onnx(hf_token=None, allow_remote_code=False, hf_config=MagicMock()) + + assert ok is True + # ModelSourceConfig dtype is float32 because INT8 is generated post-export. 
+ msc_kwargs = mock_export_dependencies.msc.call_args.kwargs + assert msc_kwargs['torch_dtype'] == 'float32' + # Exporter wrote the float32 ONNX, then quantize_dynamic was called with that file. + export_kwargs = mock_export_dependencies.exporter.export_huggingface_model.call_args.kwargs + assert export_kwargs['model_name'] == 'prajjwal1_bert-tiny.float32' + fake_quantize_module.quantize_dynamic.assert_called_once() + quantize_args = fake_quantize_module.quantize_dynamic.call_args.args + assert quantize_args[0].endswith('prajjwal1_bert-tiny.float32.onnx') + assert quantize_args[1].endswith('prajjwal1_bert-tiny.int8.onnx') + + +def test_export_hf_model_to_onnx_export_failure(mock_export_dependencies, tmp_path): + """If exporter returns falsy, the helper fails without touching pytorch_models.""" + benchmark = _make_ort_benchmark() + benchmark._ORTInferenceBenchmark__model_cache_path = tmp_path / 'checkpoints' + mock_export_dependencies.exporter.export_huggingface_model.side_effect = None + mock_export_dependencies.exporter.export_huggingface_model.return_value = None + + with patch.dict('os.environ', {'CUDA_VISIBLE_DEVICES': '0'}, clear=False): + ok = benchmark._export_hf_model_to_onnx(hf_token=None, allow_remote_code=False, hf_config=MagicMock()) + + assert ok is False + assert benchmark.return_code == ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE + assert benchmark._args.pytorch_models == [] + + +def test_export_hf_model_to_onnx_uses_proc_rank_env(mock_export_dependencies, tmp_path): + """PROC_RANK env var (or CUDA_VISIBLE_DEVICES) controls the rank subdirectory.""" + benchmark = _make_ort_benchmark() + benchmark._ORTInferenceBenchmark__model_cache_path = tmp_path / 'checkpoints' + + with patch.dict('os.environ', {'CUDA_VISIBLE_DEVICES': '0', 'PROC_RANK': '7'}, clear=False): + ok = benchmark._export_hf_model_to_onnx(hf_token=None, allow_remote_code=False, hf_config=MagicMock()) + + assert ok is True + export_kwargs = 
mock_export_dependencies.exporter.export_huggingface_model.call_args.kwargs + assert export_kwargs['output_dir'].endswith('rank_7') + assert str(benchmark._ORTInferenceBenchmark__model_cache_path).endswith('rank_7') + + +def test_export_hf_model_to_onnx_passes_allow_remote_code_to_loader(mock_export_dependencies, tmp_path): + """allow_remote_code is forwarded to the HuggingFaceModelLoader constructor.""" + benchmark = _make_ort_benchmark() + benchmark._ORTInferenceBenchmark__model_cache_path = tmp_path / 'checkpoints' + + with patch.dict('os.environ', {'CUDA_VISIBLE_DEVICES': '0'}, clear=False): + benchmark._export_hf_model_to_onnx(hf_token=None, allow_remote_code=True, hf_config=MagicMock()) + + loader_kwargs = mock_export_dependencies.loader_cls.call_args.kwargs + assert loader_kwargs['allow_remote_code'] is True + + +def test_export_hf_model_to_onnx_releases_cuda_cache(mock_export_dependencies, tmp_path): + """When CUDA is available, torch.cuda.empty_cache() is invoked after export.""" + benchmark = _make_ort_benchmark() + benchmark._ORTInferenceBenchmark__model_cache_path = tmp_path / 'checkpoints' + + with patch(f'{_ORT_MODULE}.torch.cuda') as torch_cuda, \ + patch.dict('os.environ', {'CUDA_VISIBLE_DEVICES': '0'}, clear=False): + torch_cuda.is_available.return_value = True + + ok = benchmark._export_hf_model_to_onnx(hf_token=None, allow_remote_code=False, hf_config=MagicMock()) + + assert ok is True + torch_cuda.empty_cache.assert_called_once() diff --git a/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py b/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py index 441be7af1..6af16dd41 100644 --- a/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py +++ b/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py @@ -27,9 +27,7 @@ def _make_onnx_input(name, dims): """ return SimpleNamespace( name=name, - type=SimpleNamespace( - 
tensor_type=SimpleNamespace(shape=SimpleNamespace(dim=[_make_onnx_dim(d) for d in dims])) - ), + type=SimpleNamespace(tensor_type=SimpleNamespace(shape=SimpleNamespace(dim=[_make_onnx_dim(d) for d in dims]))), ) @@ -41,6 +39,7 @@ def _make_onnx_model(inputs, initializer_names=()): class TensorRTInferenceBenchmarkTestCase(BenchmarkTestCase, unittest.TestCase): """Class for tensorrt-inferencee benchmark test cases.""" + @classmethod def setUpClass(cls): """Hook method for setting up class fixture before running tests in the class.""" @@ -195,13 +194,9 @@ def _make_benchmark(self, **arg_overrides): workspace flag already resolved) without actually invoking trtexec or touching the filesystem. """ - (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark( - self.benchmark_name, Platform.CUDA - ) + (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA) benchmark = benchmark_cls(self.benchmark_name, parameters='') - benchmark._result = BenchmarkResult( - self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1 - ) + benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1) defaults = dict( model_source='huggingface', model_identifier='prajjwal1/bert-tiny',