Runtime#

class tensorrt_llm.runtime.ChatGLMGenerationSession(
model_config: ModelConfig,
engine_buffer,
mapping: Mapping,
debug_mode=False,
debug_tensors_to_save=None,
cuda_graph_mode=False,
stream: Stream = None,
)[source]#

Bases: GenerationSession

class tensorrt_llm.runtime.EncDecModelRunner(
engine_name,
engine_dir,
lora_dir=None,
lora_task_uids=None,
debug_mode=False,
skip_encoder=False,
stream: Stream = None,
enable_context_fmha_fp32_acc: bool = None,
)[source]#

Bases: object

encoder_run(
input_ids,
input_lengths,
max_input_length,
position_ids=None,
token_type_ids=None,
debug_mode=False,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
attention_mask=None,
language_adapter_routings=None,
)[source]#
classmethod from_engine(
engine_name,
engine_dir,
lora_dir=None,
lora_task_uids=None,
debug_mode=False,
skip_encoder=False,
stream=None,
enable_context_fmha_fp32_acc=None,
)[source]#
generate(
encoder_input_ids,
decoder_input_ids,
max_new_tokens,
num_beams=1,
pad_token_id=None,
eos_token_id=None,
bos_token_id=None,
debug_mode=False,
return_dict=False,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
attention_mask=None,
time_encoder=False,
return_encoder_output=False,
encoder_language_adapter_routings=None,
decoder_language_adapter_routings=None,
)[source]#
process_input(
input_ids,
remove_input_padding=False,
pad_token_id=0,
prompt_tasks=None,
language_adapter_routings=None,
)[source]#
class tensorrt_llm.runtime.GenerationSequence(seq_idx, batch_idx)[source]#

Bases: object

get_batch_idx() int[source]#

Returns the index of the sequence within the batch

get_seq_idx() int[source]#

Returns the sequence index

class tensorrt_llm.runtime.GenerationSession(
model_config: ModelConfig,
engine_buffer,
mapping: Mapping,
debug_mode=False,
debug_tensors_to_save=None,
cuda_graph_mode=False,
stream: Stream = None,
)[source]#

Bases: object

batch_size: int#
buffer_allocated: bool#
property context_mem_size: int#
property conv_kernel#
property cross_attention#
cuda_graph_mode: bool#
cuda_stream_guard()[source]#

Synchronizes the external stream and sets the current stream to the one bound to the session. Resets on exit.

debug_mode: bool#
debug_tensors_to_save: None#
decode(
input_ids: Tensor,
context_lengths: Tensor,
sampling_config: SamplingConfig,
prompt_embedding_table: Tensor = None,
tasks: Tensor = None,
prompt_vocab_size: Tensor = None,
stop_words_list=None,
bad_words_list=None,
streaming: bool = False,
output_sequence_lengths: bool = False,
output_generation_logits: bool = False,
return_dict: bool = False,
encoder_output: Tensor = None,
encoder_input_lengths: Tensor = None,
stopping_criteria: StoppingCriteria = None,
logits_processor: LogitsProcessor = None,
cross_attention_mask: List[Tensor] = None,
**kwargs,
)[source]#
decode_batch(
input_ids: Sequence[Tensor],
sampling_config: SamplingConfig,
streaming: bool = False,
**kwargs,
)[source]#
decode_regular(
*,
batch_size: int,
scfg: SamplingConfig,
sequence_lengths: Tensor,
context_lengths: Tensor,
host_context_lengths,
max_context_length: int,
beam_width: int,
cache_indirections: list,
input_ids: Tensor,
hidden_states: Tensor,
prompt_embedding_table: Tensor,
tasks: Tensor,
prompt_vocab_size: Tensor,
ite: int,
sequence_limit_lengths: Tensor,
stop_words_data,
bad_words_data,
output_sequence_lengths: bool = False,
output_generation_logits: bool = False,
return_dict: bool = False,
encoder_output: Tensor = None,
encoder_input_lengths: Tensor = None,
stopping_criteria: StoppingCriteria = None,
logits_processor: LogitsProcessor = None,
cross_attention_mask: List[Tensor] = None,
**kwargs,
)[source]#
decode_stream(
*,
batch_size: int,
scfg: SamplingConfig,
sequence_lengths: Tensor,
context_lengths: Tensor,
host_context_lengths,
max_context_length: int,
beam_width: int,
cache_indirections: list,
input_ids: Tensor,
hidden_states: Tensor,
prompt_embedding_table: Tensor,
tasks: Tensor,
prompt_vocab_size: Tensor,
ite: int,
sequence_limit_lengths: Tensor,
stop_words_data,
bad_words_data,
output_sequence_lengths: bool = False,
output_generation_logits: bool = False,
return_dict: bool = False,
encoder_output: Tensor = None,
encoder_input_lengths: Tensor = None,
stopping_criteria: StoppingCriteria = None,
logits_processor: LogitsProcessor = None,
cross_attention_mask: List[Tensor] = None,
**kwargs,
)[source]#
device: device#
property dtype#
dump_debug_buffers(step: int) None[source]#
early_stop_criteria(batch_size, step, should_stop)[source]#
property engine_inspector#
filter_medusa_logits(
batch_size,
best_path,
best_path_lengths,
medusa_logits,
)[source]#

medusa_logits has shape [nMH, bs, nMT+1, vocab]

Returns [nMH, bs, vocab]

finalize_decoder(
context_lengths,
batch_size,
beam_width,
scfg,
in_progress=False,
)[source]#
find_best_medusa_path(
batch_size,
input_ids: Tensor,
next_logits,
temp=0,
)[source]#
property first_layer#
property gather_context_logits#
property gather_generation_logits#
property gemm_allreduce_plugin#
get_next_medusa_tokens(
batch_size,
next_medusa_logits,
)[source]#
get_num_heads_kv(
layer_idx: int | None = None,
) int[source]#
handle_per_step(
*,
cache_indirections: list,
step: int,
batch_size: int,
max_context_length: int,
beam_width: int,
input_ids: Tensor,
hidden_states: Tensor,
scfg: SamplingConfig,
kv_cache_block_offsets: Tensor,
host_kv_cache_block_offsets: Tensor,
cross_kv_cache_block_offsets: Tensor,
host_cross_kv_cache_block_offsets: Tensor,
prompt_embedding_table: Tensor,
tasks: Tensor,
context_lengths: Tensor,
host_context_lengths,
attention_mask: Tensor,
cross_attention_mask_for_context: Tensor,
cross_attention_mask_for_gen: Tensor,
prompt_vocab_size: Tensor,
ite: int,
sequence_limit_lengths: Tensor,
sequence_lengths: Tensor,
next_step_tensors: Dict[str, RuntimeTensor],
stop_words_data,
bad_words_data,
encoder_output: Tensor,
encoder_input_lengths: Tensor,
stopping_criteria: StoppingCriteria,
logits_processor: LogitsProcessor,
output_generation_logits: bool,
**kwargs,
)[source]#
property has_position_embedding#
property has_token_type_embedding#
property head_size#
property hidden_size#
property is_medusa_mode#
property is_redrafter_mode#
property kv_cache_type#
property last_layer#
locate_accepted_draft_tokens(
batch_size,
best_path,
best_path_len,
draft_paths,
)[source]#
mapping: Mapping#
property max_draft_tokens#
property max_prompt_embedding_table_size#
medusa_decode_and_verify(step, batch_size, logits)[source]#
medusa_paths: List[List[int]] = None#
medusa_position_offsets: List[int] = None#
medusa_temperature: float = 0.0#
medusa_topks: List[int] = None#
medusa_tree_ids: List[int] = None#
next_medusa_input_ids()[source]#
num_draft_tokens: int = 0#
property num_heads#
property num_layers#
property num_medusa_heads#
property paged_kv_cache#
property paged_state#
pp_communicate_final_output_ids(
final_output_ids,
batch_size,
beam_width,
)[source]#
pp_communicate_new_tokens(
should_stop,
cache_indir,
sequence_length,
)[source]#
process_logits_including_draft(
step,
batch_size,
logits,
next_step_buffer,
)[source]#
  1. Process the logits into tokens and verify them (Medusa), or process the outputs (ReDrafter)

  2. Extract the early-stop criterion here: self.accept_length

  3. Update the output ids: needs self.new_tokens and past_sequence_length

  4. Get the next input_ids: self.[new_tokens, accept_lengths, medusa_output_tokens]

  5. Update the KV cache: self.[sequence_length, num_draft_tokens]

  6. Update sequence_length_buffer and past_kv_length

property profiler#
property quant_mode#
property remove_input_padding#
property rnn_conv_dim_size#
property rnn_head_size#
property rnn_hidden_size#
runtime: _Runtime#
setup(
batch_size: int,
max_context_length: int,
max_new_tokens: int,
beam_width: int = 1,
max_attention_window_size: int | None = None,
sink_token_length: int | None = None,
encoder_max_input_length: int | None = None,
lora_manager: LoraManager = None,
lora_uids: List[str] = None,
medusa_choices: List[List[int]] = None,
multi_block_mode: bool = True,
enable_context_fmha_fp32_acc: bool = None,
)[source]#
property state_dtype#
property state_size#
property tokens_per_block#
update_output_ids_by_offset(
new_generated_ids,
offsets,
)[source]#
property use_gemm_allreduce_plugin#
property use_gpt_attention_plugin#
property use_kv_cache#
property use_lora_plugin#
property use_mamba_conv1d_plugin#
property vocab_size#
class tensorrt_llm.runtime.KVCacheManager(
*,
num_layers: int,
num_blocks: int,
block_size: int,
tokens_per_block: int,
max_blocks_per_seq: int,
max_attention_window_size: int,
sink_token_len: int,
beam_width: int = 1,
use_one_more_block: bool = False,
)[source]#

Bases: object

add_sequence(
sequence: GenerationSequence,
context_len: int,
always_share_across_beam: bool = False,
)[source]#

Adds a sequence to the manager and allocates the minimum number of blocks for the context

get_block_offsets(beam_width: int) Tensor[source]#

Returns an array of offsets into the memory pool

step(finished: List[bool])[source]#

Advances to the next generation step. Adds new blocks where needed and clears finished sequences.
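
A minimal sketch of driving the manager by hand with a single-beam request; all sizing values below are illustrative assumptions, since a real runtime derives them from the engine/model config:

from tensorrt_llm.runtime import GenerationSequence, KVCacheManager

manager = KVCacheManager(
    num_layers=2,
    num_blocks=64,
    block_size=8 * 64 * 32,        # assumed num_kv_heads * head_size * tokens_per_block
    tokens_per_block=32,
    max_blocks_per_seq=16,
    max_attention_window_size=512,
    sink_token_len=0,
    beam_width=1,
)

# Register one request with a 100-token context; the manager allocates
# the minimum number of blocks that covers it.
seq = GenerationSequence(seq_idx=0, batch_idx=0)
manager.add_sequence(seq, context_len=100)

# Offsets into the memory pool, passed to the engine at each step.
offsets = manager.get_block_offsets(beam_width=1)

# Advance one generation step; mark entries True once a sequence finishes.
manager.step(finished=[False])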

class tensorrt_llm.runtime.LogitsProcessor[source]#

Bases: object

Base class for all logits processors that can be applied during generation.
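
A subclass overrides __call__ to edit the logits at each step. A minimal sketch, assuming the Python runtime invokes the processor as (step, input_ids, scores) and expects the (possibly modified) scores back:

import torch

from tensorrt_llm.runtime import LogitsProcessor

class ForbidTokenProcessor(LogitsProcessor):
    """Masks out a single token id at every generation step."""

    def __init__(self, forbidden_id: int):
        super().__init__()
        self.forbidden_id = forbidden_id

    def __call__(self, step: int, input_ids: torch.Tensor,
                 scores: torch.Tensor) -> torch.Tensor:
        # Setting the logit to -inf removes the token from sampling.
        scores[..., self.forbidden_id] = float('-inf')
        return scores

An instance can then be passed as the logits_processor argument of GenerationSession.decode() or ModelRunner.generate().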

class tensorrt_llm.runtime.LogitsProcessorList(iterable=(), /)[source]#

Bases: list, LogitsProcessor

class tensorrt_llm.runtime.ModelConfig(
max_batch_size: int,
max_beam_width: int,
vocab_size: int,
num_layers: int,
num_heads: int,
num_kv_heads: int,
hidden_size: int,
gpt_attention_plugin: bool,
gemm_allreduce_plugin: str = None,
remove_input_padding: bool = False,
model_name: str = '',
kv_cache_type: tensorrt_llm.bindings.KVCacheType = <KVCacheType.CONTINUOUS: 0>,
cross_attention: bool = False,
head_size: int = None,
has_position_embedding: bool = True,
has_token_type_embedding: bool = False,
tokens_per_block: int = 32,
max_prompt_embedding_table_size: int = 0,
quant_mode: tensorrt_llm.quantization.mode.QuantMode = <QuantMode: 0>,
gather_context_logits: bool = False,
gather_generation_logits: bool = False,
dtype: str = '',
lora_plugin: bool = False,
lora_target_modules: List[str] = <factory>,
trtllm_modules_to_hf_modules: dict = None,
skip_cross_kv: bool = False,
num_medusa_heads: int = 0,
max_medusa_tokens: int = 0,
paged_state: bool = True,
mamba_conv1d_plugin: bool = True,
conv_kernel: int = 0,
layer_types: List[str] = <factory>,
rnn_hidden_size: int = 0,
rnn_head_size: int = 0,
rnn_conv_dim_size: int = 0,
state_size: int = 0,
state_dtype: str = '',
gpu_weights_percent: float = 1.0,
redrafter_num_beams: int = 0,
redrafter_draft_len_per_beam: int = 0,
num_kv_heads_per_layer: Optional[List[int]] = None,
num_kv_heads_per_cross_attn_layer: Optional[List[int]] = None,
skip_cross_attn_blocks: bool = False,
language_adapter_config: Optional[tensorrt_llm.layers.language_adapter.LanguageAdapterConfig] = None,
)[source]#

Bases: object

conv_kernel: int = 0#
cross_attention: bool = False#
dtype: str = ''#
gather_context_logits: bool = False#
gather_generation_logits: bool = False#
gemm_allreduce_plugin: str = None#
gpt_attention_plugin: bool#
gpu_weights_percent: float = 1.0#
has_position_embedding: bool = True#
has_token_type_embedding: bool = False#
head_size: int = None#
hidden_size: int#
kv_cache_type: KVCacheType = <KVCacheType.CONTINUOUS: 0>#
language_adapter_config: LanguageAdapterConfig | None = None#
layer_types: List[str]#
lora_plugin: bool = False#
lora_target_modules: List[str]#
mamba_conv1d_plugin: bool = True#
max_batch_size: int#
max_beam_width: int#
max_medusa_tokens: int = 0#
max_prompt_embedding_table_size: int = 0#
model_name: str = ''#
num_heads: int#
num_kv_heads: int#
num_kv_heads_per_cross_attn_layer: List[int] | None = None#
num_kv_heads_per_layer: List[int] | None = None#
num_layers: int#
num_medusa_heads: int = 0#
paged_state: bool = True#
quant_mode: QuantMode = 0#
redrafter_draft_len_per_beam: int = 0#
redrafter_num_beams: int = 0#
remove_input_padding: bool = False#
rnn_conv_dim_size: int = 0#
rnn_head_size: int = 0#
rnn_hidden_size: int = 0#
skip_cross_attn_blocks: bool = False#
skip_cross_kv: bool = False#
state_dtype: str = ''#
state_size: int = 0#
tokens_per_block: int = 32#
trtllm_modules_to_hf_modules: dict = None#
vocab_size: int#
class tensorrt_llm.runtime.ModelRunner(
session: GenerationSession,
max_batch_size: int,
max_input_len: int,
max_seq_len: int,
max_beam_width: int,
kv_cache_type: KVCacheType,
lora_manager: LoraManager | None = None,
)[source]#

Bases: ModelRunnerMixin

An interface class that wraps a GenerationSession and provides generation methods.

property dtype: dtype#
classmethod from_dir(
engine_dir: str,
*,
max_output_len: int | None = None,
lora_dir: List[str] | None = None,
rank: int = 0,
debug_mode: bool = False,
lora_ckpt_source: str = 'hf',
medusa_choices: List[List[int]] = None,
stream: Stream = None,
gpu_weights_percent: float = 1,
enable_context_fmha_fp32_acc: bool | None = None,
multi_block_mode: bool | None = None,
) ModelRunner[source]#

Creates a ModelRunner instance from an engine directory.

Parameters:
  • engine_dir (str) – The directory that contains the serialized engine files and config files.

  • max_output_len (Optional[int]) – The maximum output length. This argument may only take effect at load time; when the KV cache is disabled, generation is still checked against it.

  • lora_dir (Optional[List[str]]) – The directories that contain the LoRA weights.

  • rank (int) – The runtime rank id.

  • debug_mode (bool) – Whether to turn on debug mode.

  • medusa_choices (List[List[int]]) – The Medusa choices to use in Medusa decoding.

  • stream (torch.cuda.Stream) – The stream to use.

  • multi_block_mode (bool) – Whether to distribute the work across multiple CUDA thread blocks on the GPU for the masked MHA kernel.

Returns:

An instance of ModelRunner.

Return type:

ModelRunner
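
A typical single-GPU load, sketched with a placeholder path to a prebuilt engine directory:

from tensorrt_llm.runtime import ModelRunner

# "./engine_dir" is a placeholder path to a prebuilt engine directory.
runner = ModelRunner.from_dir(engine_dir="./engine_dir", rank=0)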

classmethod from_engine(
engine: Engine,
*,
max_output_len: int | None,
lora_dir: List[str] | None,
rank: int,
debug_mode: bool,
lora_ckpt_source: str,
medusa_choices: List[List[int]],
stream: Stream,
gpu_weights_percent: float,
enable_context_fmha_fp32_acc: bool | None,
multi_block_mode: bool | None,
) ModelRunner[source]#
property gather_context_logits: bool#
property gather_generation_logits: bool#
generate(
batch_input_ids: List[Tensor],
position_ids: List[Tensor] = None,
sampling_config: SamplingConfig | None = None,
prompt_table: str | Tensor | None = None,
prompt_tasks: str | None = None,
lora_uids: list | None = None,
streaming: bool = False,
output_generation_logits: bool = False,
stopping_criteria: StoppingCriteria | None = None,
logits_processor: LogitsProcessor | None = None,
medusa_choices: List[List[int]] | None = None,
encoder_max_input_length: int = None,
encoder_input_features: List[Tensor] = None,
encoder_output_lengths: List[Tensor] = None,
cross_attention_masks: List[Tensor] = None,
**kwargs,
) Tensor | dict[source]#

Generates sequences of token ids. The generation-controlling parameters are set in sampling_config; it is set to a default one if not passed. You can override any attribute of sampling_config by passing the corresponding parameter.

Parameters:
  • batch_input_ids (List[torch.Tensor]) – A list of input id tensors. Each tensor has shape (sequence_length, ).

  • sampling_config (SamplingConfig) – The sampling configuration used as the base parametrization for the generation call. The passed **kwargs matching the attributes of sampling_config will override them. If sampling_config is not provided, a default one will be used.

  • prompt_table (str | torch.Tensor) – The file path of the prompt table (.npy format, exported by nemo_prompt_convert.py) or the prompt table itself.

  • prompt_tasks (str) – The prompt tuning task ids for the input batch, in the format of a comma-separated list (e.g., 0,3,1,0).

  • lora_uids (list) – The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module.

  • streaming (bool) – Whether or not to use streaming mode for generation.

  • stopping_criteria (StoppingCriteria) – Custom stopping criteria.

  • logits_processor (LogitsProcessor) – Custom logits processors.

  • medusa_choices (List[List[int]]) – Medusa decoding choices.

  • kwargs (Dict[str, Any]) – Ad-hoc parametrization of sampling_config. The passed **kwargs matching the attributes of sampling_config will override them.

Returns:

If return_dict=False, the method returns the generated output_ids. If return_dict=True, the method returns a dict containing output_ids, sequence_lengths (if sampling_config.output_sequence_lengths=True), and context_logits and generation_logits (if self.gather_context_logits=True and self.gather_generation_logits=True, respectively).

Return type:

torch.Tensor or dict
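
For illustration, a minimal call on a runner created by from_dir; the token ids and end/pad ids are placeholders, and the keyword arguments override the matching SamplingConfig attributes:

import torch

# One 1-D input id tensor per batch entry; the ids are placeholders.
batch_input_ids = [torch.tensor([1, 7, 42, 99], dtype=torch.int32)]

outputs = runner.generate(
    batch_input_ids,
    max_new_tokens=32,          # overrides SamplingConfig.max_new_tokens
    end_id=2,                   # placeholder end/pad token ids
    pad_id=2,
    temperature=0.8,
    top_k=40,
    return_dict=True,
    output_sequence_lengths=True,
)
torch.cuda.synchronize()
output_ids = outputs["output_ids"]              # [batch_size, num_beams, seq_len]
sequence_lengths = outputs["sequence_lengths"]  # [batch_size, num_beams]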

property hidden_size: int#
property mapping: Mapping#
property max_prompt_embedding_table_size: int#
property max_sequence_length: int#
property num_heads: int#
property num_layers: int#
property remove_input_padding: bool#
serialize_engine() IHostMemory[source]#

Serializes the engine.

Returns:

The serialized engine.

Return type:

bytes

property use_lora_plugin: bool#
property vocab_size: int#
property vocab_size_padded: int#
class tensorrt_llm.runtime.ModelRunnerCpp(
executor: Executor,
max_batch_size: int,
max_input_len: int,
max_seq_len: int,
max_beam_width: int,
model_config: ModelConfig,
world_config: WorldConfig,
use_kv_cache: bool,
lora_manager: LoraManager | None = None,
)[source]#

Bases: ModelRunnerMixin

An interface class that wraps an Executor and provides generation methods.

property dtype: dtype#
classmethod from_dir(
engine_dir: str,
*,
lora_dir: str | None = None,
rank: int = 0,
max_batch_size: int | None = None,
max_input_len: int | None = None,
max_output_len: int | None = None,
max_beam_width: int | None = None,
max_attention_window_size: list[int] | None = None,
sink_token_length: int | None = None,
kv_cache_free_gpu_memory_fraction: float | None = None,
cross_kv_cache_fraction: float | None = None,
medusa_choices: list[list[int]] | None = None,
eagle_choices: list[list[int]] | None = None,
eagle_posterior_threshold: float | None = None,
eagle_use_dynamic_tree: bool = False,
eagle_dynamic_tree_max_top_k: int | None = None,
lookahead_config: list[int] | None = None,
debug_mode: bool = False,
lora_ckpt_source: str = 'hf',
use_gpu_direct_storage: bool = False,
gpu_weights_percent: float = 1,
max_tokens_in_paged_kv_cache: int | None = None,
kv_cache_enable_block_reuse: bool = False,
enable_chunked_context: bool = False,
is_enc_dec: bool = False,
multi_block_mode: bool = True,
enable_context_fmha_fp32_acc: bool | None = None,
cuda_graph_mode: bool | None = None,
logits_processor_map: Dict[str, LogitsProcessor] | None = None,
device_ids: List[int] | None = None,
is_orchestrator_mode: bool = False,
use_runtime_defaults: bool = True,
gather_generation_logits: bool = False,
use_variable_beam_width_search: bool = False,
mm_embedding_offloading: bool = False,
) ModelRunnerCpp[source]#

Creates a ModelRunnerCpp instance from an engine directory.

Parameters:
  • engine_dir (str) – The directory that contains the serialized engine files and config files.

  • lora_dir (str) – The directory that contains the LoRA weights.

  • rank (int) – The runtime rank id.

  • max_batch_size (int) – The runtime batch size limit. If max_batch_size is not None, it should not be larger than the engine's max_batch_size; otherwise the engine's max_batch_size will be used.

  • max_input_len (int) – The runtime input length limit. If max_input_len is not None, it should not be larger than the engine's max_input_len; otherwise the engine's max_input_len will be used.

  • max_output_len (int) – The runtime output length limit. If max_output_len is not None, it should not be larger than the engine's max_output_len; otherwise the engine's max_output_len will be used.

  • max_beam_width (int) – The runtime beam width limit. If max_beam_width is not None, it should not be larger than the engine's max_beam_width; otherwise the engine's max_beam_width will be used.

  • max_attention_window_size (List[int]) – The attention window size that controls the sliding-window-attention / cyclic-KV-cache behavior.

  • sink_token_length (int) – The sink token length; defaults to 0.

  • kv_cache_free_gpu_memory_fraction (float) – The fraction of free GPU memory used by the KV cache.

  • cross_kv_cache_fraction (float) – The fraction of the KV cache reserved for cross attention; should only be used with enc-dec models.

  • debug_mode (bool) – Whether to turn on debug mode.

  • medusa_choices (List[List[int]]) – The Medusa choices to use in Medusa decoding.

  • eagle_choices (List[List[int]]) – The Eagle choices to use in Eagle-1 decoding.

  • eagle_posterior_threshold (float) – The minimum token probability threshold for typical acceptance. A non-None value enables typical acceptance in Eagle.

  • eagle_use_dynamic_tree (bool) – Whether to use Eagle-2 (dynamic tree).

  • eagle_dynamic_tree_max_top_k (int) – The maximum number of draft tokens to expand for each node in Eagle-2.

  • lora_ckpt_source (str) – The source of the checkpoint. Should be one of ['hf', 'nemo'].

  • max_tokens_in_paged_kv_cache (int) – The maximum number of tokens configured in the KV cache.

  • kv_cache_enable_block_reuse (bool) – Enables block reuse in the KV cache.

  • enable_chunked_context (bool) – Enables chunked context.

  • is_enc_dec (bool) – Whether the model is an encoder-decoder architecture.

  • multi_block_mode (bool) – Whether to distribute the work across multiple CUDA thread blocks on the GPU for the masked MHA kernel.

  • enable_context_fmha_fp32_acc (bool) – Enables FP32 accumulation for the FMHA runner.

  • cuda_graph_mode (bool) – Whether to use CUDA graphs at inference time.

  • logits_processor_map (Dict[str, LogitsProcessor]) – A map of logits processor functions indexed by name. A name can later be provided to the generate() function to specify which logits processor to run.

  • device_ids (List[int]) – The device indices to run the Executor on.

  • is_orchestrator_mode (bool) – The mode to run the model runner; defaults to leader mode.

  • gather_generation_logits (bool) – Enables gathering of generation logits.

Returns:

An instance of ModelRunnerCpp.

Return type:

ModelRunnerCpp
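
A sketch of loading through the C++ executor with an explicit KV cache budget; the path and all limits are placeholders and must respect the engine's build-time maxima:

from tensorrt_llm.runtime import ModelRunnerCpp

runner = ModelRunnerCpp.from_dir(
    engine_dir="./engine_dir",               # placeholder path
    max_batch_size=8,                        # must not exceed the engine's limit
    kv_cache_free_gpu_memory_fraction=0.9,   # give the KV cache 90% of free VRAM
    kv_cache_enable_block_reuse=True,
    enable_chunked_context=True,
)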

property gather_context_logits: bool#
property gather_generation_logits: bool#
generate(
batch_input_ids: List[Tensor],
*,
position_ids: List[Tensor] = None,
encoder_input_ids: List[Tensor] = None,
encoder_input_features: List[Tensor] = None,
encoder_output_lengths: List[int] = None,
cross_attention_masks: List[Tensor] = None,
mrope_params: MropeParams | None = None,
sampling_config: SamplingConfig | None = None,
lora_uids: list | None = None,
lookahead_config: list[int] | None = None,
streaming: bool = False,
stopping_criteria: StoppingCriteria | None = None,
logits_processor_names: list[str] | None = None,
max_new_tokens: int = 1,
end_id: int | None = None,
pad_id: int | None = None,
bad_words_list: list[list[int]] | None = None,
stop_words_list: list[list[int]] | None = None,
return_dict: bool = False,
output_sequence_lengths: bool = False,
output_generation_logits: bool = False,
output_log_probs: bool = False,
output_cum_log_probs: bool = False,
prompt_table: str | Tensor | None = None,
prompt_tasks: str | None = None,
input_token_extra_ids: List[List[int]] = None,
return_all_generated_tokens: bool = False,
language_adapter_uids: List[int] | None = None,
mm_embedding_offloading: bool = False,
**kwargs,
) Tensor | dict[source]#

Generates sequences of token ids. The generation-controlling parameters are set in sampling_config; it is set to a default one if not passed. You can override any attribute of sampling_config by passing the corresponding parameter.

Parameters:
  • batch_input_ids (List[torch.Tensor]) – A list of input id tensors. Each tensor has shape (sequence_length, ).

  • position_ids (List[torch.Tensor]) – A list of position id tensors. Each tensor has shape (sequence_length, ).

  • encoder_input_ids (List[torch.Tensor]) – A list of encoder input id tensors for encoder-decoder models (optional). Each tensor has shape (sequence_length, ).

  • encoder_input_features (List[torch.Tensor]) – A list of encoder input feature tensors for multimodal encoder-decoder models (optional). Each tensor has shape (sequence_length, feature_dim).

  • encoder_output_lengths (List[int]) – A list of encoder output lengths (optional), for when the encoder output has a different length from the encoder input (due to convolution down-sampling, etc.).

  • sampling_config (SamplingConfig) – The sampling configuration used as the base parametrization for the generation call. The passed **kwargs matching the attributes of sampling_config will override them. If sampling_config is not provided, a default one will be used.

  • prompt_table (str | torch.Tensor) – The file path of the prompt table (.npy format, exported by nemo_prompt_convert.py) or the prompt table itself.

  • prompt_tasks (str) – The prompt tuning task ids for the input batch, in the format of a comma-separated list (e.g., 0,3,1,0).

  • input_token_extra_ids (List[List[int]]) – The input token extra ids for using p-tuning and KV cache reuse together.

  • lora_uids (list) – The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module.

  • streaming (bool) – Whether or not to use streaming mode for generation.

  • stopping_criteria (StoppingCriteria) – Custom stopping criteria.

  • logits_processor_names (List[str]) – Custom logits processor names.

  • return_all_generated_tokens (bool) – Whether the full output is returned at each streaming step.

  • kwargs (Dict[str, Any]) – Ad-hoc parametrization of sampling_config. The passed **kwargs matching the attributes of sampling_config will override them.

Returns:

If return_dict=False, the method returns the generated output_ids. If return_dict=True, the method returns a dict containing output_ids, sequence_lengths (if sampling_config.output_sequence_lengths=True), and context_logits and generation_logits (if self.gather_context_logits=True and self.gather_generation_logits=True, respectively).

Return type:

torch.Tensor or dict
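
Unlike the Python ModelRunner, decoding controls such as max_new_tokens, end_id and pad_id appear directly in this signature. A sketch with placeholder token ids:

import torch

batch_input_ids = [torch.tensor([1, 7, 42, 99], dtype=torch.int32)]

outputs = runner.generate(
    batch_input_ids,
    max_new_tokens=64,
    end_id=2,                    # placeholder end/pad token ids
    pad_id=2,
    return_dict=True,
    output_sequence_lengths=True,
)
output_ids = outputs["output_ids"]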

property hidden_size: int#
property max_prompt_embedding_table_size: int#
property max_sequence_length: int#
property num_heads: int#
property num_layers: int#
property remove_input_padding: bool#
property vocab_size: int#
property vocab_size_padded: int#
class tensorrt_llm.runtime.MultimodalModelRunner(args)[source]#

Bases: object

property audio_engine_dir#
property cpp_e2e#
property cpp_llm_only#
generate(
pre_prompt,
post_prompt,
image,
decoder_input_ids,
max_new_tokens,
other_vision_inputs={},
other_audio_inputs={},
other_decoder_inputs={},
)[source]#
get_audio_features(audio, other_audio_inputs)[source]#
get_rope_index(
input_ids: LongTensor,
image_grid_thw: LongTensor | None = None,
video_grid_thw: LongTensor | None = None,
attention_mask: Tensor | None = None,
) Tuple[Tensor, Tensor][source]#

Calculates the 3D rope index based on the temporal, height and width dimensions of images and videos in the LLM.

Explanation:

Each embedding sequence contains vision embeddings and text embeddings, or just text embeddings.

For a pure text embedding sequence, the rotary position embedding is no different from that of modern LLMs. Example:

input_ids: [T T T T T], where T is text.
temporal position_ids: [0, 1, 2, 3, 4]
height position_ids: [0, 1, 2, 3, 4]
width position_ids: [0, 1, 2, 3, 4]

For a mixed vision and text embedding sequence, we compute 3D rotary position embeddings for the vision part and 1D rotary position embeddings for the text part. Example:

Suppose we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
input_ids: [V V V V V V V V V V V V T T T T T], where V is vision.
vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
text temporal position_ids: [3, 4, 5, 6, 7]
text height position_ids: [3, 4, 5, 6, 7]
text width position_ids: [3, 4, 5, 6, 7]
Here the text start position_ids are computed as the maximum vision position_ids plus 1.

Parameters:
  • input_ids (torch.LongTensor of shape (batch_size, sequence_length)) – Indices of the input sequence tokens in the vocabulary. Padding will be ignored by default, should you provide it.

  • image_grid_thw (torch.LongTensor of shape (num_images, 3), optional) – The temporal, height and width of the feature shape of each image in the LLM.

  • video_grid_thw (torch.LongTensor of shape (num_videos, 3), optional) – The temporal, height and width of the feature shape of each video in the LLM.

  • attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional) –

    Mask to avoid performing attention on padding token indices. Mask values are selected in [0, 1]:

    • 1 for tokens that are not masked,

    • 0 for tokens that are masked.

Returns:

position_ids (torch.LongTensor of shape (3, batch_size, sequence_length))
mrope_position_deltas (torch.Tensor of shape (batch_size))

get_visual_features(image, other_vision_inputs)[source]#
init_audio_encoder()[source]#
init_image_encoder()[source]#
init_llm()[source]#
init_processor()[source]#
init_tokenizer()[source]#
property llm_engine_dir#
load_test_audio(audio_path)[source]#
load_test_data(image_path=None, video_path=None)[source]#
prepare_position_ids_for_cogvlm(input_ids)[source]#
preprocess(
pre_prompt,
post_prompt,
image,
other_vision_inputs,
other_audio_inputs,
)[source]#
ptuning_setup(
prompt_table,
input_ids,
input_lengths,
)[source]#
ptuning_setup_fuyu(
input_ids,
image_patches_indices,
)[source]#
ptuning_setup_llava_next(
visual_features,
pre_prompt,
post_prompt,
)[source]#
ptuning_setup_phi3(
visual_features,
audio_features,
input_ids,
num_img_tokens,
num_aud_tokens,
)[source]#
property python_e2e#
run(
input_text,
input_image,
input_audio,
max_new_tokens,
)[source]#
setup_fake_prompts(
visual_features,
pre_input_ids,
post_input_ids,
input_lengths,
)[source]#
setup_fake_prompts_qwen2vl(
visual_features,
input_ids,
vision_grid_thws,
attention_mask,
input_lengths,
)[source]#
setup_fake_prompts_vila(
batch_size,
visual_features,
split_input_ids,
input_lengths,
)[source]#
setup_inputs(
input_text,
raw_image,
raw_audio=None,
)[source]#
split_prompt_by_images(tensor)[source]#
static tokenizer_image_token(
batch_size,
pre_prompt,
post_prompt,
tokenizer,
image_token_index=-200,
)[source]#
video_preprocess(video_path)[source]#
property visual_engine_dir#
class tensorrt_llm.runtime.QWenForCausalLMGenerationSession(
model_config: ModelConfig,
engine_buffer,
mapping: Mapping,
debug_mode=False,
debug_tensors_to_save=None,
cuda_graph_mode=False,
stream: Stream = None,
global_max_input_length: int = 2048,
global_max_output_length: int = 4096,
)[source]#

Bases: GenerationSession

generate(
input_ids: Tensor,
input_lengths: Tensor,
sampling_config: SamplingConfig,
max_new_tokens: int,
runtime_rank: int = 0,
)[source]#
class tensorrt_llm.runtime.SamplingConfig(
end_id: int,
pad_id: int,
max_new_tokens: int = 20,
num_beams: int = 1,
num_return_sequences: int | None = None,
max_attention_window_size: int | None = None,
sink_token_length: int | None = None,
output_sequence_lengths: bool = False,
return_dict: bool = False,
stop_words_list: list | numpy.ndarray | torch.Tensor | NoneType = None,
bad_words_list: list | numpy.ndarray | torch.Tensor | NoneType = None,
temperature: float | torch.Tensor = 1.0,
top_k: int | torch.Tensor = 1,
top_p: float | torch.Tensor = 0.0,
top_p_decay: torch.Tensor | None = None,
top_p_min: torch.Tensor | None = None,
top_p_reset_ids: torch.Tensor | None = None,
random_seed: int | torch.Tensor = None,
length_penalty: float | torch.Tensor = 1.0,
early_stopping: int | torch.Tensor = 1,
repetition_penalty: float | torch.Tensor = 1.0,
min_length: int | torch.Tensor = 1,
presence_penalty: float | torch.Tensor = 0.0,
frequency_penalty: float | torch.Tensor = 0.0,
use_beam_hyps: bool = True,
min_p: float | torch.Tensor = 0.0,
)[source]#

Bases: object

bad_words_list: list | ndarray | Tensor | None = None#
beam_search_diversity_rate: float | Tensor = 0.0#
early_stopping: int | Tensor = 1#
end_id: int#
frequency_penalty: float | Tensor = 0.0#
length_penalty: float | Tensor = 1.0#
max_attention_window_size: int | None = None#
max_new_tokens: int = 20#
min_length: int | Tensor = 1#
min_p: float | Tensor = 0.0#
no_repeat_ngram_size: int | Tensor = None#
num_beams: int = 1#
num_return_sequences: int | None = None#
output_cum_log_probs: bool = False#
output_log_probs: bool = False#
output_sequence_lengths: bool = False#
pad_id: int#
presence_penalty: float | Tensor = 0.0#
random_seed: int | Tensor = None#
repetition_penalty: float | Tensor = 1.0#
return_dict: bool = False#
sink_token_length: int | None = None#
stop_words_list: list | ndarray | Tensor | None = None#
temperature: float | Tensor = 1.0#
top_k: int | Tensor = 1#
top_p: float | Tensor = 0.0#
top_p_decay: Tensor | None = None#
top_p_min: Tensor | None = None#
top_p_reset_ids: Tensor | None = None#
update(**kwargs)[source]#
use_beam_hyps: bool = True#
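
For example, when driving GenerationSession.decode() directly, a config is built once and tweaked with update(); the end/pad token ids below are placeholders:

from tensorrt_llm.runtime import SamplingConfig

sampling_config = SamplingConfig(
    end_id=2,                 # placeholder end/pad token ids
    pad_id=2,
    max_new_tokens=64,
    num_beams=1,
    temperature=0.7,
    top_k=40,
    top_p=0.9,
)

# Ad-hoc overrides without rebuilding the object.
sampling_config.update(temperature=1.0, random_seed=42)
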
class tensorrt_llm.runtime.Session(**kwargs)[source]#

Bases: object

A Session is a managed TensorRT runtime.

property context: IExecutionContext#
@brief: Get the default TensorRT execution context; use self.engine.create_execution_context() to create a new context if needed.

@return: A TensorRT execution context object.

property context_mem_size: int#
property engine: ICudaEngine#
static from_engine(engine) Session[source]#

@brief: Create a session from an existing ICudaEngine. @param engine: an ICudaEngine. @return: a Session object.

static from_serialized_engine(
engine,
) Session[source]#

@brief: Create a session from a serialized engine. @param engine: a serialized engine. @return: a Session object.

infer_shapes(
inputs: List[TensorInfo],
context: IExecutionContext | None = None,
) List[TensorInfo][source]#
@brief: Set the input shapes for the given context, and infer the output shapes from the given input shapes.

This function should be called every time the input shapes change, before calling run(). Alternatively, call context.set_input_shape manually on all input tensors with dynamic shapes.

@param inputs: a list of TensorInfo objects, each item representing an input tensor. @param context: a TensorRT execution context; if None, the default context is used. @return: a list of TensorInfo objects, each item representing an output tensor, or None on failure.

run(
inputs: Dict[str, Any],
outputs: Dict[str, Any],
stream,
context=None,
) bool[source]#

@brief: Run the TensorRT engine with the given inputs and outputs. @param inputs: a dict of input tensors, where the key is the tensor name and the value is a tensor pointer or torch tensor. @param outputs: a dict of output tensors, where the key is the tensor name and the value is a tensor pointer or torch tensor. @param stream: the CUDA stream on which to enqueue the TensorRT engine. @param context: a TensorRT execution context; if None, the default context is used. @return: True if the enqueue succeeds; note that enqueue is an asynchronous call, so returning True does not mean the execution has finished.
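
Putting from_serialized_engine, infer_shapes and run together; a minimal sketch assuming a serialized engine at a placeholder path, a single input tensor named "input", and FP32 outputs:

import tensorrt as trt
import torch

from tensorrt_llm.runtime import Session, TensorInfo

with open("model.engine", "rb") as f:        # placeholder engine path
    session = Session.from_serialized_engine(f.read())

inputs = {"input": torch.ones(1, 16, dtype=torch.float32, device="cuda")}

# Propagate the input shapes through the engine to size the outputs.
output_info = session.infer_shapes(
    [TensorInfo("input", trt.DataType.FLOAT, tuple(inputs["input"].shape))]
)
outputs = {
    t.name: torch.empty(tuple(t.shape), dtype=torch.float32, device="cuda")
    for t in output_info
}

stream = torch.cuda.current_stream()
ok = session.run(inputs=inputs, outputs=outputs, stream=stream.cuda_stream)
assert ok
stream.synchronize()   # run() only enqueues; wait for the results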

property runtime: Runtime#
set_shapes(
tensor_dict: Dict[str, Tensor],
context: IExecutionContext | None = None,
)[source]#
class tensorrt_llm.runtime.StoppingCriteria[source]#

Bases: object

Base class for all stopping criteria that can be applied during generation.
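
A subclass returns True from __call__ to halt generation. A minimal sketch, assuming the Python runtime's (step, input_ids, scores) calling convention:

import torch

from tensorrt_llm.runtime import StoppingCriteria

class MaxStepsCriteria(StoppingCriteria):
    """Stops generation after a fixed number of decoding steps."""

    def __init__(self, max_steps: int):
        super().__init__()
        self.max_steps = max_steps

    def __call__(self, step: int, input_ids: torch.Tensor,
                 scores: torch.Tensor) -> bool:
        return step >= self.max_steps

An instance can be passed as the stopping_criteria argument of decode() or generate().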

class tensorrt_llm.runtime.StoppingCriteriaList(iterable=(), /)[source]#

Bases: list, StoppingCriteria

class tensorrt_llm.runtime.TensorInfo(name: 'str', dtype: 'trt.DataType', shape: 'tuple')[source]#

Bases: object

dtype: DataType#
name: str#
numel()[source]#
shape: tuple#
squeeze(dim=0)[source]#
view(*shape)[source]#
tensorrt_llm.runtime.decode_words_list(
word_dict: List[List[str]],
tokenizer=None,
add_special_tokens=False,
)[source]#
Format of word_dict:

len(word_dict) should equal batch_size. word_dict[i] holds the words for batch item i, and len(word_dict[i]) >= 1, meaning it must contain at least one string. For example, word_dict[2] = [" I am happy", " I am sad"].
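
A sketch of preparing per-batch word lists, assuming a Hugging Face tokenizer (the model name is a placeholder); the result can be handed to decode() as the bad_words_list or stop_words_list argument:

from transformers import AutoTokenizer

from tensorrt_llm.runtime import decode_words_list

tokenizer = AutoTokenizer.from_pretrained("gpt2")   # placeholder tokenizer

# One word list per batch entry, each with at least one string.
word_dict = [
    [" bad phrase"],
    [" something else"],
    [" I am happy", " I am sad"],
]
bad_words_list = decode_words_list(word_dict, tokenizer)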