Runtime#
- class tensorrt_llm.runtime.ChatGLMGenerationSession(
- model_config: ModelConfig,
- engine_buffer,
- mapping: Mapping,
- debug_mode=False,
- debug_tensors_to_save=None,
- cuda_graph_mode=False,
- stream: Stream = None,
Bases:
GenerationSession
- class tensorrt_llm.runtime.EncDecModelRunner(
- engine_name,
- engine_dir,
- lora_dir=None,
- lora_task_uids=None,
- debug_mode=False,
- skip_encoder=False,
- stream: Stream = None,
- enable_context_fmha_fp32_acc: bool = None,
Bases:
object
- encoder_run(
- input_ids,
- input_lengths,
- max_input_length,
- position_ids=None,
- token_type_ids=None,
- debug_mode=False,
- prompt_embedding_table=None,
- prompt_tasks=None,
- prompt_vocab_size=None,
- attention_mask=None,
- language_adapter_routings=None,
- classmethod from_engine(
- engine_name,
- engine_dir,
- lora_dir=None,
- lora_task_uids=None,
- debug_mode=False,
- skip_encoder=False,
- stream=None,
- enable_context_fmha_fp32_acc=None,
- generate(
- encoder_input_ids,
- decoder_input_ids,
- max_new_tokens,
- num_beams=1,
- pad_token_id=None,
- eos_token_id=None,
- bos_token_id=None,
- debug_mode=False,
- return_dict=False,
- prompt_embedding_table=None,
- prompt_tasks=None,
- prompt_vocab_size=None,
- attention_mask=None,
- time_encoder=False,
- return_encoder_output=False,
- encoder_language_adapter_routings=None,
- decoder_language_adapter_routings=None,
- class tensorrt_llm.runtime.GenerationSession(
- model_config: ModelConfig,
- engine_buffer,
- mapping: Mapping,
- debug_mode=False,
- debug_tensors_to_save=None,
- cuda_graph_mode=False,
- stream: Stream = None,
Bases:
object
- batch_size: int#
- buffer_allocated: bool#
- property context_mem_size: int#
- property conv_kernel#
- property cross_attention#
- cuda_graph_mode: bool#
- debug_mode: bool#
- debug_tensors_to_save: None#
- decode(
- input_ids: Tensor,
- context_lengths: Tensor,
- sampling_config: SamplingConfig,
- prompt_embedding_table: Tensor = None,
- tasks: Tensor = None,
- prompt_vocab_size: Tensor = None,
- stop_words_list=None,
- bad_words_list=None,
- streaming: bool = False,
- output_sequence_lengths: bool = False,
- output_generation_logits: bool = False,
- return_dict: bool = False,
- encoder_output: Tensor = None,
- encoder_input_lengths: Tensor = None,
- stopping_criteria: StoppingCriteria = None,
- logits_processor: LogitsProcessor = None,
- cross_attention_mask: List[Tensor] = None,
- **kwargs,
- decode_batch(
- input_ids: Sequence[Tensor],
- sampling_config: SamplingConfig,
- streaming: bool = False,
- **kwargs,
- decode_regular(
- *,
- batch_size: int,
- scfg: SamplingConfig,
- sequence_lengths: Tensor,
- context_lengths: Tensor,
- host_context_lengths,
- max_context_length: int,
- beam_width: int,
- cache_indirections: list,
- input_ids: Tensor,
- hidden_states: Tensor,
- prompt_embedding_table: Tensor,
- tasks: Tensor,
- prompt_vocab_size: Tensor,
- ite: int,
- sequence_limit_lengths: Tensor,
- stop_words_data,
- bad_words_data,
- output_sequence_lengths: bool = False,
- output_generation_logits: bool = False,
- return_dict: bool = False,
- encoder_output: Tensor = None,
- encoder_input_lengths: Tensor = None,
- stopping_criteria: StoppingCriteria = None,
- logits_processor: LogitsProcessor = None,
- cross_attention_mask: List[Tensor] = None,
- **kwargs,
- decode_stream(
- *,
- batch_size: int,
- scfg: SamplingConfig,
- sequence_lengths: Tensor,
- context_lengths: Tensor,
- host_context_lengths,
- max_context_length: int,
- beam_width: int,
- cache_indirections: list,
- input_ids: Tensor,
- hidden_states: Tensor,
- prompt_embedding_table: Tensor,
- tasks: Tensor,
- prompt_vocab_size: Tensor,
- ite: int,
- sequence_limit_lengths: Tensor,
- stop_words_data,
- bad_words_data,
- output_sequence_lengths: bool = False,
- output_generation_logits: bool = False,
- return_dict: bool = False,
- encoder_output: Tensor = None,
- encoder_input_lengths: Tensor = None,
- stopping_criteria: StoppingCriteria = None,
- logits_processor: LogitsProcessor = None,
- cross_attention_mask: List[Tensor] = None,
- **kwargs,
- device: device#
- property dtype#
- property engine_inspector#
- filter_medusa_logits(
- batch_size,
- best_path,
- best_path_lengths,
- medusa_logits,
medusa_logits has shape [nMH, bs, nMT+1, vocab]
Returns [nMH, bs, vocab]
- property first_layer#
- property gather_context_logits#
- property gather_generation_logits#
- property gemm_allreduce_plugin#
- handle_per_step(
- *,
- cache_indirections: list,
- step: int,
- batch_size: int,
- max_context_length: int,
- beam_width: int,
- input_ids: Tensor,
- hidden_states: Tensor,
- scfg: SamplingConfig,
- kv_cache_block_offsets: Tensor,
- host_kv_cache_block_offsets: Tensor,
- cross_kv_cache_block_offsets: Tensor,
- host_cross_kv_cache_block_offsets: Tensor,
- prompt_embedding_table: Tensor,
- tasks: Tensor,
- context_lengths: Tensor,
- host_context_lengths,
- attention_mask: Tensor,
- cross_attention_mask_for_context: Tensor,
- cross_attention_mask_for_gen: Tensor,
- prompt_vocab_size: Tensor,
- ite: int,
- sequence_limit_lengths: Tensor,
- sequence_lengths: Tensor,
- next_step_tensors: Dict[str, RuntimeTensor],
- stop_words_data,
- bad_words_data,
- encoder_output: Tensor,
- encoder_input_lengths: Tensor,
- stopping_criteria: StoppingCriteria,
- logits_processor: LogitsProcessor,
- output_generation_logits: bool,
- **kwargs,
- property has_position_embedding#
- property has_token_type_embedding#
- property head_size#
- property is_medusa_mode#
- property is_redrafter_mode#
- property kv_cache_type#
- property last_layer#
- mapping: Mapping#
- property max_draft_tokens#
- property max_prompt_embedding_table_size#
- medusa_paths: List[List[int]] = None#
- medusa_position_offsets: List[int] = None#
- medusa_temperature: float = 0.0#
- medusa_topks: List[int] = None#
- medusa_tree_ids: List[int] = None#
- num_draft_tokens: int = 0#
- property num_heads#
- property num_layers#
- property num_medusa_heads#
- property paged_kv_cache#
- property paged_state#
- process_logits_including_draft(
- step,
- batch_size,
- logits,
- next_step_buffer,
Process logits into tokens and verify them (Medusa), or process the outputs (ReDrafter).
Extract early-stop criteria here: self.accept_length.
Update output ids: needs self.new_tokens and past_sequence_length.
Get the next input_ids: self.[new_tokens, accept_lengths, medusa_output_tokens].
Update the KV cache: self.[sequence_length, num_draft_tokens].
Update sequence_length_buffer and past_kv_length.
- property profiler#
- property quant_mode#
- property remove_input_padding#
- reorder_kv_cache_for_beam_search(
- batch_size: int,
- beam_width: int,
- max_context_length: int,
- step: int,
- property rnn_conv_dim_size#
- property rnn_head_size#
- runtime: _Runtime#
- setup(
- batch_size: int,
- max_context_length: int,
- max_new_tokens: int,
- beam_width: int = 1,
- max_attention_window_size: int | None = None,
- sink_token_length: int | None = None,
- encoder_max_input_length: int | None = None,
- lora_manager: LoraManager = None,
- lora_uids: List[str] = None,
- medusa_choices: List[List[int]] = None,
- multi_block_mode: bool = True,
- enable_context_fmha_fp32_acc: bool = None,
- property state_dtype#
- property state_size#
- property tokens_per_block#
- property use_gemm_allreduce_plugin#
- property use_gpt_attention_plugin#
- property use_kv_cache#
- property use_lora_plugin#
- property use_mamba_conv1d_plugin#
- property vocab_size#
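For illustration, a minimal setup-and-decode sketch; model_config (a ModelConfig) and engine_buffer (the serialized engine) are assumed to come from a prebuilt engine, the token ids are random placeholders, and padded inputs as shown assume an engine built without remove_input_padding:

```python
import torch
from tensorrt_llm import Mapping
from tensorrt_llm.runtime import GenerationSession, SamplingConfig

# model_config and engine_buffer are assumed to come from a prebuilt engine.
session = GenerationSession(model_config, engine_buffer,
                            Mapping(world_size=1, rank=0))

batch_size, max_context_length, max_new_tokens = 1, 8, 32
session.setup(batch_size, max_context_length, max_new_tokens)

input_ids = torch.randint(0, model_config.vocab_size,
                          (batch_size, max_context_length),
                          dtype=torch.int32, device="cuda")
context_lengths = torch.full((batch_size,), max_context_length,
                             dtype=torch.int32, device="cuda")

output_ids = session.decode(input_ids, context_lengths,
                            SamplingConfig(end_id=2, pad_id=2))  # placeholder ids
```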
- class tensorrt_llm.runtime.KVCacheManager(
- *,
- num_layers: int,
- num_blocks: int,
- block_size: int,
- tokens_per_block: int,
- max_blocks_per_seq: int,
- max_attention_window_size: int,
- sink_token_len: int,
- beam_width: int = 1,
- use_one_more_block: bool = False,
Bases:
object
- add_sequence(
- sequence: GenerationSequence,
- context_len: int,
- always_share_across_beam: bool = False,
Add the sequence to the manager and allocate the minimum number of blocks for the context.
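For illustration, a minimal sketch of reserving context blocks for one sequence. The sizing values are placeholders, and the GenerationSequence import path is an assumption to check against your version's exports:

```python
from tensorrt_llm.runtime import KVCacheManager
# Assumption: GenerationSequence lives next to KVCacheManager in this module.
from tensorrt_llm.runtime.kv_cache_manager import GenerationSequence

tokens_per_block = 32
manager = KVCacheManager(
    num_layers=32,
    num_blocks=1024,
    block_size=2 * 32 * tokens_per_block * 128,  # placeholder sizing
    tokens_per_block=tokens_per_block,
    max_blocks_per_seq=64,
    max_attention_window_size=2048,
    sink_token_len=0,
)

# Reserve the minimum number of blocks for a 100-token context.
manager.add_sequence(GenerationSequence(seq_idx=0, batch_idx=0), context_len=100)
```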
- class tensorrt_llm.runtime.LogitsProcessorList(iterable=(), /)[source]#
Bases:
list, LogitsProcessor
- class tensorrt_llm.runtime.ModelConfig(
- max_batch_size: int,
- max_beam_width: int,
- vocab_size: int,
- num_layers: int,
- num_heads: int,
- num_kv_heads: int,
- hidden_size: int,
- gpt_attention_plugin: bool,
- gemm_allreduce_plugin: str = None,
- remove_input_padding: bool = False,
- model_name: str = '',
- kv_cache_type: tensorrt_llm.bindings.KVCacheType = <KVCacheType.CONTINUOUS: 0>,
- cross_attention: bool = False,
- head_size: int = None,
- has_position_embedding: bool = True,
- has_token_type_embedding: bool = False,
- tokens_per_block: int = 32,
- max_prompt_embedding_table_size: int = 0,
- quant_mode: tensorrt_llm.quantization.mode.QuantMode = <QuantMode: 0>,
- gather_context_logits: bool = False,
- gather_generation_logits: bool = False,
- dtype: str = '',
- lora_plugin: bool = False,
- lora_target_modules: List[str] = <factory>,
- trtllm_modules_to_hf_modules: dict = None,
- skip_cross_kv: bool = False,
- num_medusa_heads: int = 0,
- max_medusa_tokens: int = 0,
- paged_state: bool = True,
- mamba_conv1d_plugin: bool = True,
- conv_kernel: int = 0,
- layer_types: List[str] = <factory>,
- rnn_hidden_size: int = 0,
- rnn_head_size: int = 0,
- rnn_conv_dim_size: int = 0,
- state_size: int = 0,
- state_dtype: str = '',
- gpu_weights_percent: float = 1.0,
- redrafter_num_beams: int = 0,
- redrafter_draft_len_per_beam: int = 0,
- num_kv_heads_per_layer: Optional[List[int]] = None,
- num_kv_heads_per_cross_attn_layer: Optional[List[int]] = None,
- skip_cross_attn_blocks: bool = False,
- language_adapter_config: Optional[tensorrt_llm.layers.language_adapter.LanguageAdapterConfig] = None,
Bases:
object
- conv_kernel: int = 0#
- cross_attention: bool = False#
- dtype: str = ''#
- gather_context_logits: bool = False#
- gather_generation_logits: bool = False#
- gemm_allreduce_plugin: str = None#
- gpt_attention_plugin: bool#
- gpu_weights_percent: float = 1.0#
- has_position_embedding: bool = True#
- has_token_type_embedding: bool = False#
- head_size: int = None#
- kv_cache_type: KVCacheType = <KVCacheType.CONTINUOUS: 0>#
- language_adapter_config: LanguageAdapterConfig | None = None#
- layer_types: List[str]#
- lora_plugin: bool = False#
- lora_target_modules: List[str]#
- mamba_conv1d_plugin: bool = True#
- max_batch_size: int#
- max_beam_width: int#
- max_medusa_tokens: int = 0#
- max_prompt_embedding_table_size: int = 0#
- model_name: str = ''#
- num_heads: int#
- num_kv_heads: int#
- num_kv_heads_per_cross_attn_layer: List[int] | None = None#
- num_kv_heads_per_layer: List[int] | None = None#
- num_layers: int#
- num_medusa_heads: int = 0#
- paged_state: bool = True#
- redrafter_draft_len_per_beam: int = 0#
- redrafter_num_beams: int = 0#
- remove_input_padding: bool = False#
- rnn_conv_dim_size: int = 0#
- rnn_head_size: int = 0#
- skip_cross_attn_blocks: bool = False#
- skip_cross_kv: bool = False#
- state_dtype: str = ''#
- state_size: int = 0#
- tokens_per_block: int = 32#
- trtllm_modules_to_hf_modules: dict = None#
- vocab_size: int#
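For illustration, a minimal ModelConfig for a small decoder-only model; the values are placeholders rather than a tuned configuration for any real checkpoint:

```python
from tensorrt_llm.runtime import ModelConfig

model_config = ModelConfig(
    max_batch_size=8,
    max_beam_width=1,
    vocab_size=32000,
    num_layers=32,
    num_heads=32,
    num_kv_heads=32,
    hidden_size=4096,
    gpt_attention_plugin=True,
    remove_input_padding=True,
    dtype="float16",
)
```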
- class tensorrt_llm.runtime.ModelRunner(
- session: GenerationSession,
- max_batch_size: int,
- max_input_len: int,
- max_seq_len: int,
- max_beam_width: int,
- kv_cache_type: KVCacheType,
- lora_manager: LoraManager | None = None,
Bases:
ModelRunnerMixin
An interface class that wraps GenerationSession and provides generation methods.
- property dtype: dtype#
- classmethod from_dir(
- engine_dir: str,
- *,
- max_output_len: int | None = None,
- lora_dir: List[str] | None = None,
- rank: int = 0,
- debug_mode: bool = False,
- lora_ckpt_source: str = 'hf',
- medusa_choices: List[List[int]] = None,
- stream: Stream = None,
- gpu_weights_percent: float = 1,
- enable_context_fmha_fp32_acc: bool | None = None,
- multi_block_mode: bool | None = None,
Create a ModelRunner instance from an engine directory.
- Parameters:
engine_dir (str) – The directory that contains the serialized engine files and config files.
max_output_len (Optional[int]) – The maximum output length; this limit may only take effect at load time, and when the KV cache is disabled, generation is still checked against it.
lora_dir (Optional[List[str]]) – The directories that contain the LoRA weights.
rank (int) – The runtime rank id.
debug_mode (bool) – Whether or not to turn on the debug mode.
medusa_choices (List[List[int]]) – Medusa choices to use when in Medusa decoding.
stream (torch.cuda.Stream) – The stream to use.
multi_block_mode (bool) – Whether to distribute the work across multiple CUDA thread blocks on the GPU for the masked MHA kernel.
- Returns:
An instance of ModelRunner.
- Return type:
ModelRunner
- classmethod from_engine(
- engine: Engine,
- *,
- max_output_len: int | None,
- lora_dir: List[str] | None,
- rank: int,
- debug_mode: bool,
- lora_ckpt_source: str,
- medusa_choices: List[List[int]],
- stream: Stream,
- gpu_weights_percent: float,
- enable_context_fmha_fp32_acc: bool | None,
- multi_block_mode: bool | None,
- property gather_context_logits: bool#
- property gather_generation_logits: bool#
- generate(
- batch_input_ids: List[Tensor],
- position_ids: List[Tensor] = None,
- sampling_config: SamplingConfig | None = None,
- prompt_table: str | Tensor | None = None,
- prompt_tasks: str | None = None,
- lora_uids: list | None = None,
- streaming: bool = False,
- output_generation_logits: bool = False,
- stopping_criteria: StoppingCriteria | None = None,
- logits_processor: LogitsProcessor | None = None,
- medusa_choices: List[List[int]] | None = None,
- encoder_max_input_length: int = None,
- encoder_input_features: List[Tensor] = None,
- encoder_output_lengths: List[Tensor] = None,
- cross_attention_masks: List[Tensor] = None,
- **kwargs,
Generate sequences of token ids. The generation-controlling parameters are set in sampling_config; if it is None, default values are used. You can override any attribute of sampling_config by passing corresponding parameters.
- Parameters:
batch_input_ids (List[torch.Tensor]) – A list of input id tensors. Each tensor has shape (sequence_length, ).
sampling_config (SamplingConfig) – The sampling configuration used as the base parametrization for the generation call. The passed **kwargs matching the sampling_config's attributes will override them. If sampling_config is not provided, default values are used.
prompt_table (str or torch.Tensor) – The file path of the prompt table (.npy format, exported by nemo_prompt_convert.py) or the prompt table itself.
prompt_tasks (str) – The prompt-tuning task ids for the input batch, in the format of a comma-separated list (e.g., 0,3,1,0).
lora_uids (list) – The UIDs of the LoRA weights for the input batch. Use -1 to disable the LoRA module.
streaming (bool) – Whether or not to use streaming mode for generation.
stopping_criteria (StoppingCriteria) – Custom stopping criteria.
logits_processor (LogitsProcessor) – Custom logits processors.
medusa_choices (List[List[int]]) – Medusa decoding choices.
kwargs (Dict[str, Any]) – Ad-hoc parametrization of sampling_config. The passed **kwargs matching the sampling_config's attributes will override them.
- Returns:
If return_dict=False, the method returns the generated output_ids. If return_dict=True, the method returns a dict of output_ids, sequence_lengths (if sampling_config.output_sequence_lengths=True), and context_logits and generation_logits (if self.gather_context_logits=True and self.gather_generation_logits=True, respectively).
- Return type:
torch.Tensor or dict
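For illustration, a minimal end-to-end sketch; ./engine_dir, the token ids, and the special-token ids are placeholders:

```python
import torch
from tensorrt_llm.runtime import ModelRunner, SamplingConfig

runner = ModelRunner.from_dir(engine_dir="./engine_dir", rank=0)  # placeholder path

batch_input_ids = [torch.tensor([1, 7423, 532], dtype=torch.int32)]  # placeholder token ids
sampling_config = SamplingConfig(end_id=2, pad_id=2,   # placeholder special-token ids
                                 max_new_tokens=32, top_k=1)

output_ids = runner.generate(batch_input_ids, sampling_config=sampling_config)
```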
- property mapping: Mapping#
- property max_prompt_embedding_table_size: int#
- property max_sequence_length: int#
- property num_heads: int#
- property num_layers: int#
- property remove_input_padding: bool#
- property use_lora_plugin: bool#
- property vocab_size: int#
- property vocab_size_padded: int#
- class tensorrt_llm.runtime.ModelRunnerCpp(
- executor: Executor,
- max_batch_size: int,
- max_input_len: int,
- max_seq_len: int,
- max_beam_width: int,
- model_config: ModelConfig,
- world_config: WorldConfig,
- use_kv_cache: bool,
- lora_manager: LoraManager | None = None,
Bases:
ModelRunnerMixin
An interface class that wraps Executor and provides generation methods.
- property dtype: dtype#
- classmethod from_dir(
- engine_dir: str,
- *,
- lora_dir: str | None = None,
- rank: int = 0,
- max_batch_size: int | None = None,
- max_input_len: int | None = None,
- max_output_len: int | None = None,
- max_beam_width: int | None = None,
- max_attention_window_size: list[int] | None = None,
- sink_token_length: int | None = None,
- kv_cache_free_gpu_memory_fraction: float | None = None,
- cross_kv_cache_fraction: float | None = None,
- medusa_choices: list[list[int]] | None = None,
- eagle_choices: list[list[int]] | None = None,
- eagle_posterior_threshold: float | None = None,
- eagle_use_dynamic_tree: bool = False,
- eagle_dynamic_tree_max_top_k: int | None = None,
- lookahead_config: list[int] | None = None,
- debug_mode: bool = False,
- lora_ckpt_source: str = 'hf',
- use_gpu_direct_storage: bool = False,
- gpu_weights_percent: float = 1,
- max_tokens_in_paged_kv_cache: int | None = None,
- kv_cache_enable_block_reuse: bool = False,
- enable_chunked_context: bool = False,
- is_enc_dec: bool = False,
- multi_block_mode: bool = True,
- enable_context_fmha_fp32_acc: bool | None = None,
- cuda_graph_mode: bool | None = None,
- logits_processor_map: Dict[str, LogitsProcessor] | None = None,
- device_ids: List[int] | None = None,
- is_orchestrator_mode: bool = False,
- use_runtime_defaults: bool = True,
- gather_generation_logits: bool = False,
- use_variable_beam_width_search: bool = False,
- mm_embedding_offloading: bool = False,
Create a ModelRunnerCpp instance from an engine directory.
- Parameters:
engine_dir (str) – The directory that contains the serialized engine files and config files.
lora_dir (str) – The directory that contains the LoRA weights.
rank (int) – The runtime rank id.
max_batch_size (int) – The runtime batch size limit. If max_batch_size is not None, it should not be larger than the engine's max_batch_size; otherwise the engine's max_batch_size is used.
max_input_len (int) – The runtime input length limit. If max_input_len is not None, it should not be larger than the engine's max_input_len; otherwise the engine's max_input_len is used.
max_output_len (int) – The runtime output length limit. If max_output_len is not None, it should not be larger than the engine's max_output_len; otherwise the engine's max_output_len is used.
max_beam_width (int) – The runtime beam width limit. If max_beam_width is not None, it should not be larger than the engine's max_beam_width; otherwise the engine's max_beam_width is used.
max_attention_window_size (List[int]) – The attention window size that controls the sliding-window-attention / cyclic-KV-cache behavior.
sink_token_length (int) – The sink token length, default=0.
kv_cache_free_gpu_memory_fraction (float) – The fraction of free GPU memory that the KV cache uses.
cross_kv_cache_fraction (float) – The fraction of the KV cache reserved for cross attention; should only be used with enc-dec models.
debug_mode (bool) – Whether or not to turn on the debug mode.
medusa_choices (List[List[int]]) – Medusa choices to use when in Medusa decoding.
eagle_choices (List[List[int]]) – Eagle choices to use when in Eagle-1 decoding.
eagle_posterior_threshold (float) – Minimum token probability threshold for typical acceptance. A non-None value enables typical acceptance in Eagle.
eagle_use_dynamic_tree (bool) – Whether to use Eagle-2 (dynamic tree).
eagle_dynamic_tree_max_top_k (int) – The maximum number of draft tokens expanded per node in Eagle-2.
lora_ckpt_source (str) – Source of the checkpoint. Should be one of ['hf', 'nemo'].
max_tokens_in_paged_kv_cache (int) – The maximum number of tokens configured in the KV cache.
kv_cache_enable_block_reuse (bool) – Enables block reuse in the KV cache.
enable_chunked_context (bool) – Enables chunked context.
is_enc_dec (bool) – Whether the model is an encoder-decoder architecture.
multi_block_mode (bool) – Whether to distribute the work across multiple CUDA thread blocks on the GPU for the masked MHA kernel.
enable_context_fmha_fp32_acc (bool) – Enables FP32 accumulation for the FMHA runner.
cuda_graph_mode (bool) – Whether to use CUDA graphs at inference time.
logits_processor_map (Dict[str, LogitsProcessor]) – A map of logits processor functions indexed by names. A name can be provided later to the generate() function to specify which logits processor to run.
device_ids (List[int]) – Device indices to run the Executor on.
is_orchestrator_mode (bool) – The mode to run the model-runner; defaults to leader mode.
gather_generation_logits (bool) – Enables gathering generation logits.
- Returns:
An instance of ModelRunnerCpp.
- Return type:
ModelRunnerCpp
- property gather_context_logits: bool#
- property gather_generation_logits: bool#
- generate(
- batch_input_ids: List[Tensor],
- *,
- position_ids: List[Tensor] = None,
- encoder_input_ids: List[Tensor] = None,
- encoder_input_features: List[Tensor] = None,
- encoder_output_lengths: List[int] = None,
- cross_attention_masks: List[Tensor] = None,
- mrope_params: MropeParams | None = None,
- sampling_config: SamplingConfig | None = None,
- lora_uids: list | None = None,
- lookahead_config: list[int] | None = None,
- streaming: bool = False,
- stopping_criteria: StoppingCriteria | None = None,
- logits_processor_names: list[str] | None = None,
- max_new_tokens: int = 1,
- end_id: int | None = None,
- pad_id: int | None = None,
- bad_words_list: list[list[int]] | None = None,
- stop_words_list: list[list[int]] | None = None,
- return_dict: bool = False,
- output_sequence_lengths: bool = False,
- output_generation_logits: bool = False,
- output_log_probs: bool = False,
- output_cum_log_probs: bool = False,
- prompt_table: str | Tensor | None = None,
- prompt_tasks: str | None = None,
- input_token_extra_ids: List[List[int]] = None,
- return_all_generated_tokens: bool = False,
- language_adapter_uids: List[int] | None = None,
- mm_embedding_offloading: bool = False,
- **kwargs,
Generate sequences of token ids. The generation-controlling parameters are set in sampling_config; if it is None, default values are used. You can override any attribute of sampling_config by passing corresponding parameters.
- Parameters:
batch_input_ids (List[torch.Tensor]) – A list of input id tensors. Each tensor has shape (sequence_length, ).
position_ids (List[torch.Tensor]) – A list of position id tensors. Each tensor has shape (sequence_length, ).
encoder_input_ids (List[torch.Tensor]) – A list of encoder input id tensors for encoder-decoder models (optional). Each tensor has shape (sequence_length, ).
encoder_input_features (List[torch.Tensor]) – A list of encoder input feature tensors for multimodal encoder-decoder models (optional). Each tensor has shape (sequence_length, feature_dim).
encoder_output_lengths (List[int]) – A list of encoder output lengths (optional), if the encoder output length differs from the encoder input length (due to convolution down-sampling, etc.).
sampling_config (SamplingConfig) – The sampling configuration used as the base parametrization for the generation call. The passed **kwargs matching the sampling_config's attributes will override them. If sampling_config is not provided, a default configuration is used.
prompt_table (str or torch.Tensor) – The file path of the prompt table (.npy format, exported by nemo_prompt_convert.py) or the prompt table itself.
prompt_tasks (str) – The prompt-tuning task ids for the input batch, in the format of a comma-separated list (e.g., 0,3,1,0).
input_token_extra_ids (List[List[int]]) – Input token extra ids for using p-tuning and KV-cache reuse together.
lora_uids (list) – The UIDs of the LoRA weights for the input batch. Use -1 to disable the LoRA module.
streaming (bool) – Whether or not to use streaming mode for generation.
stopping_criteria (StoppingCriteria) – Custom stopping criteria.
logits_processor_names (List[str]) – Custom logits processor names.
return_all_generated_tokens (bool) – Whether the full output is returned at each streaming step.
kwargs (Dict[str, Any]) – Ad-hoc parametrization of sampling_config. The passed **kwargs matching the sampling_config's attributes will override them.
- Returns:
If return_dict=False, the method returns the generated output_ids. If return_dict=True, the method returns a dict of output_ids, sequence_lengths (if sampling_config.output_sequence_lengths=True), and context_logits and generation_logits (if self.gather_context_logits=True and self.gather_generation_logits=True, respectively).
- Return type:
torch.Tensor or dict
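For illustration, a minimal sketch with placeholder path, token ids, and special-token ids; note that the signature above makes most arguments keyword-only:

```python
import torch
from tensorrt_llm.runtime import ModelRunnerCpp

runner = ModelRunnerCpp.from_dir(engine_dir="./engine_dir", rank=0)  # placeholder path

batch_input_ids = [torch.tensor([1, 7423, 532], dtype=torch.int32)]  # placeholder token ids

outputs = runner.generate(
    batch_input_ids,
    max_new_tokens=32,
    end_id=2,            # placeholder special-token ids
    pad_id=2,
    return_dict=True,
    output_sequence_lengths=True,
)
print(outputs["output_ids"], outputs["sequence_lengths"])
```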
- property max_prompt_embedding_table_size: int#
- property max_sequence_length: int#
- property num_heads: int#
- property num_layers: int#
- property remove_input_padding: bool#
- property vocab_size: int#
- property vocab_size_padded: int#
- class tensorrt_llm.runtime.MultimodalModelRunner(args)[source]#
Bases:
object
- property audio_engine_dir#
- property cpp_e2e#
- property cpp_llm_only#
- generate(
- pre_prompt,
- post_prompt,
- image,
- decoder_input_ids,
- max_new_tokens,
- other_vision_inputs={},
- other_audio_inputs={},
- other_decoder_inputs={},
- get_rope_index(
- input_ids: LongTensor,
- image_grid_thw: LongTensor | None = None,
- video_grid_thw: LongTensor | None = None,
- attention_mask: Tensor | None = None,
Calculate the 3D RoPE index based on the temporal, height, and width dimensions of images and videos in the LLM.
- Explanation
Each embedding sequence contains vision embeddings and text embeddings, or just text embeddings.
For pure-text embedding sequences, the rotary position embedding is no different from that of modern LLMs. Example:
input_ids: [T T T T T], where T is for text.
temporal position_ids: [0, 1, 2, 3, 4]
height position_ids: [0, 1, 2, 3, 4]
width position_ids: [0, 1, 2, 3, 4]
For vision-and-text embedding sequences, we compute 3D rotary position embeddings for the vision part and 1D rotary position embeddings for the text part. Example:
Assume we have a video input with 3 temporal patches, 2 height patches, and 2 width patches.
input_ids: [V V V V V V V V V V V V T T T T T], where V is for vision.
vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
text temporal position_ids: [3, 4, 5, 6, 7]
text height position_ids: [3, 4, 5, 6, 7]
text width position_ids: [3, 4, 5, 6, 7]
Here we compute the text start position_ids as the maximum vision position_ids plus 1.
- Parameters:
input_ids (torch.LongTensor of shape (batch_size, sequence_length)) – Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it.
image_grid_thw (torch.LongTensor of shape (num_images, 3), optional) – The temporal, height, and width of the feature shape of each image in the LLM.
video_grid_thw (torch.LongTensor of shape (num_videos, 3), optional) – The temporal, height, and width of the feature shape of each video in the LLM.
attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional) –
Mask to avoid performing attention on padding token indices. Mask values are selected in [0, 1]:
1 for tokens that are not masked,
0 for tokens that are masked.
- Returns:
position_ids (torch.LongTensor of shape (3, batch_size, sequence_length))
mrope_position_deltas (torch.Tensor of shape (batch_size))
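To make the example above concrete, a tiny standalone snippet that reproduces the 3D position ids for a video with 3 temporal, 2 height, and 2 width patches followed by 5 text tokens (an illustration of the indexing scheme, not the library's implementation):

```python
import torch

t, h, w = 3, 2, 2   # temporal / height / width patches
num_text = 5

# 3D position ids for the vision part: each axis indexes its own grid dimension.
temporal = torch.arange(t).repeat_interleave(h * w)      # [0,0,0,0,1,1,1,1,2,2,2,2]
height = torch.arange(h).repeat_interleave(w).repeat(t)  # [0,0,1,1,0,0,1,1,0,0,1,1]
width = torch.arange(w).repeat(t * h)                    # [0,1,0,1,0,1,0,1,0,1,0,1]

# Text tokens continue 1D from (max vision position id + 1) on all three axes.
start = int(max(temporal.max(), height.max(), width.max())) + 1
text = torch.arange(start, start + num_text)             # [3,4,5,6,7]

position_ids = torch.stack([
    torch.cat([temporal, text]),
    torch.cat([height, text]),
    torch.cat([width, text]),
])  # shape (3, sequence_length)
```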
- property llm_engine_dir#
- ptuning_setup_phi3(
- visual_features,
- audio_features,
- input_ids,
- num_img_tokens,
- num_aud_tokens,
- property python_e2e#
- setup_fake_prompts_qwen2vl(
- visual_features,
- input_ids,
- vision_grid_thws,
- attention_mask,
- input_lengths,
- static tokenizer_image_token(
- batch_size,
- pre_prompt,
- post_prompt,
- tokenizer,
- image_token_index=-200,
- property visual_engine_dir#
- class tensorrt_llm.runtime.QWenForCausalLMGenerationSession(
- model_config: ModelConfig,
- engine_buffer,
- mapping: Mapping,
- debug_mode=False,
- debug_tensors_to_save=None,
- cuda_graph_mode=False,
- stream: Stream = None,
- global_max_input_length: int = 2048,
- global_max_output_length: int = 4096,
Bases:
GenerationSession
- generate(
- input_ids: Tensor,
- input_lengths: Tensor,
- sampling_config: SamplingConfig,
- max_new_tokens: int,
- runtime_rank: int = 0,
- class tensorrt_llm.runtime.SamplingConfig(
- end_id: int,
- pad_id: int,
- max_new_tokens: int = 20,
- num_beams: int = 1,
- num_return_sequences: int | None = None,
- max_attention_window_size: int | None = None,
- sink_token_length: int | None = None,
- output_sequence_lengths: bool = False,
- return_dict: bool = False,
- stop_words_list: list | numpy.ndarray | torch.Tensor | NoneType = None,
- bad_words_list: list | numpy.ndarray | torch.Tensor | NoneType = None,
- temperature: float | torch.Tensor = 1.0,
- top_k: int | torch.Tensor = 1,
- top_p: float | torch.Tensor = 0.0,
- top_p_decay: torch.Tensor | None = None,
- top_p_min: torch.Tensor | None = None,
- top_p_reset_ids: torch.Tensor | None = None,
- random_seed: int | torch.Tensor = None,
- length_penalty: float | torch.Tensor = 1.0,
- early_stopping: int | torch.Tensor = 1,
- repetition_penalty: float | torch.Tensor = 1.0,
- min_length: int | torch.Tensor = 1,
- presence_penalty: float | torch.Tensor = 0.0,
- frequency_penalty: float | torch.Tensor = 0.0,
- use_beam_hyps: bool = True,
- min_p: float | torch.Tensor = 0.0,
Bases:
object
- bad_words_list: list | ndarray | Tensor | None = None#
- beam_search_diversity_rate: float | Tensor = 0.0#
- early_stopping: int | Tensor = 1#
- end_id: int#
- frequency_penalty: float | Tensor = 0.0#
- length_penalty: float | Tensor = 1.0#
- max_attention_window_size: int | None = None#
- max_new_tokens: int = 20#
- min_length: int | Tensor = 1#
- min_p: float | Tensor = 0.0#
- no_repeat_ngram_size: int | Tensor = None#
- num_beams: int = 1#
- num_return_sequences: int | None = None#
- output_cum_log_probs: bool = False#
- output_log_probs: bool = False#
- output_sequence_lengths: bool = False#
- pad_id: int#
- presence_penalty: float | Tensor = 0.0#
- random_seed: int | Tensor = None#
- repetition_penalty: float | Tensor = 1.0#
- return_dict: bool = False#
- sink_token_length: int | None = None#
- stop_words_list: list | ndarray | Tensor | None = None#
- temperature: float | Tensor = 1.0#
- top_k: int | Tensor = 1#
- top_p: float | Tensor = 0.0#
- top_p_decay: Tensor | None = None#
- top_p_min: Tensor | None = None#
- top_p_reset_ids: Tensor | None = None#
- use_beam_hyps: bool = True#
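For example, a greedy configuration with a short generation budget; the end_id and pad_id values are placeholders for your tokenizer's special token ids:

```python
from tensorrt_llm.runtime import SamplingConfig

sampling_config = SamplingConfig(
    end_id=2,          # placeholder EOS id
    pad_id=2,          # placeholder PAD id
    max_new_tokens=64,
    num_beams=1,
    temperature=1.0,
    top_k=1,           # top_k=1 makes decoding greedy
)
```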
- class tensorrt_llm.runtime.Session(**kwargs)[source]#
Bases:
object
Session is a managed TensorRT runtime.
- property context: IExecutionContext#
- Get the default TensorRT execution context,
use self.engine.create_execution_context() if a new context needs to be created
@return: a TensorRT execution context object
- Type:
@brief
- property context_mem_size: int#
- property engine: ICudaEngine#
- static from_engine(engine) Session [source]#
@brief: Create a Session from an existing ICudaEngine engine @param engine: an ICudaEngine @return: a Session object
- static from_serialized_engine(
- engine,
@brief: Create a Session from a serialized engine @param engine: a serialized engine @return: a Session object
- infer_shapes(
- inputs: List[TensorInfo],
- context: IExecutionContext | None = None,
- @brief: Set input shapes for the given context, and infer the output shapes from the given input shapes.
This function should be called every time the input shapes change, before calling run(). Otherwise, call context.set_input_shape on all dynamic-shaped input tensors manually.
@param inputs: list of TensorInfo objects, each item represents an input tensor @param context: TensorRT execution context; if None, the default context is used @return: list of TensorInfo objects, each item represents an output tensor, or None on failure
- run(
- inputs: Dict[str, Any],
- outputs: Dict[str, Any],
- stream,
- context=None,
@brief: Run the TensorRT engine with the given inputs and outputs @param inputs: dict of input tensors, where the key is the tensor name and the value is the tensor pointer or torch tensor @param outputs: dict of output tensors, where the key is the tensor name and the value is the tensor pointer or torch tensor @param stream: the CUDA stream used to enqueue the TensorRT engine @param context: TensorRT execution context; if None, the default context is used @return: True if the enqueue succeeds; note that enqueue is an async call,
returning True does not mean the execution has finished
- property runtime: Runtime#
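Putting the pieces together, a minimal sketch of running a raw engine through Session; it assumes engine.plan holds a serialized TensorRT engine with a single FP32 input named "x" (the path, tensor name, shape, and output dtype are all placeholders):

```python
import torch
import tensorrt as trt
from tensorrt_llm.runtime import Session, TensorInfo

with open("engine.plan", "rb") as f:        # placeholder path
    session = Session.from_serialized_engine(f.read())

inputs = {"x": torch.ones(1, 16, dtype=torch.float32, device="cuda")}

# Infer the output shapes from the input shapes before running.
output_info = session.infer_shapes(
    [TensorInfo("x", trt.DataType.FLOAT, tuple(inputs["x"].shape))])
outputs = {
    t.name: torch.empty(tuple(t.shape), dtype=torch.float32, device="cuda")
    for t in output_info  # assumes FP32 outputs; map t.dtype in real code
}

stream = torch.cuda.current_stream().cuda_stream
ok = session.run(inputs, outputs, stream)   # async enqueue; True on success
torch.cuda.current_stream().synchronize()   # wait for execution to finish
```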
- class tensorrt_llm.runtime.StoppingCriteriaList(iterable=(), /)[source]#
Bases:
list, StoppingCriteria