Runtime#

class tensorrt_llm.runtime.ChatGLMGenerationSession(
model_config: ModelConfig,
engine_buffer,
mapping: Mapping,
debug_mode=False,
debug_tensors_to_save=None,
cuda_graph_mode=False,
stream: Stream = None,
)[source]#

Bases: GenerationSession

class tensorrt_llm.runtime.EncDecModelRunner(
engine_name,
engine_dir,
lora_dir=None,
lora_task_uids=None,
debug_mode=False,
skip_encoder=False,
stream: Stream = None,
enable_context_fmha_fp32_acc: bool = None,
)[source]#

Bases: object

encoder_run(
input_ids,
input_lengths,
max_input_length,
position_ids=None,
token_type_ids=None,
debug_mode=False,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
attention_mask=None,
language_adapter_routings=None,
)[source]#
classmethod from_engine(
engine_name,
engine_dir,
lora_dir=None,
lora_task_uids=None,
debug_mode=False,
skip_encoder=False,
stream=None,
enable_context_fmha_fp32_acc=None,
)[source]#
generate(
encoder_input_ids,
decoder_input_ids,
max_new_tokens,
num_beams=1,
pad_token_id=None,
eos_token_id=None,
bos_token_id=None,
debug_mode=False,
return_dict=False,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
attention_mask=None,
time_encoder=False,
return_encoder_output=False,
encoder_language_adapter_routings=None,
decoder_language_adapter_routings=None,
)[source]#
process_input(
input_ids,
remove_input_padding=False,
pad_token_id=0,
prompt_tasks=None,
language_adapter_routings=None,
)[source]#
class tensorrt_llm.runtime.GenerationSequence(seq_idx, batch_idx)[source]#

Bases: object

get_batch_idx() int[source]#

Returns the index of the sequence within the batch

get_seq_idx() int[source]#

Returns the sequence index

class tensorrt_llm.runtime.GenerationSession(
model_config: ModelConfig,
engine_buffer,
mapping: Mapping,
debug_mode=False,
debug_tensors_to_save=None,
cuda_graph_mode=False,
stream: Stream = None,
)[source]#

Bases: object

batch_size: int#
buffer_allocated: bool#
property context_mem_size: int#
property conv_kernel#
property cross_attention#
cuda_graph_mode: bool#
cuda_stream_guard()[source]#

Synchronizes the external stream and sets the current stream to the one bound to the session. Resets on exit.

debug_mode: bool#
debug_tensors_to_save: None#
decode(
input_ids: Tensor,
context_lengths: Tensor,
sampling_config: SamplingConfig,
prompt_embedding_table: Tensor = None,
tasks: Tensor = None,
prompt_vocab_size: Tensor = None,
stop_words_list=None,
bad_words_list=None,
streaming: bool = False,
output_sequence_lengths: bool = False,
output_generation_logits: bool = False,
return_dict: bool = False,
encoder_output: Tensor = None,
encoder_input_lengths: Tensor = None,
stopping_criteria: StoppingCriteria = None,
logits_processor: LogitsProcessor = None,
cross_attention_mask: List[Tensor] = None,
**kwargs,
)[source]#
decode_batch(
input_ids: Sequence[Tensor],
sampling_config: SamplingConfig,
streaming: bool = False,
**kwargs,
)[source]#
decode_regular(
*,
batch_size: int,
scfg: SamplingConfig,
sequence_lengths: Tensor,
context_lengths: Tensor,
host_context_lengths,
max_context_length: int,
beam_width: int,
cache_indirections: list,
input_ids: Tensor,
hidden_states: Tensor,
prompt_embedding_table: Tensor,
tasks: Tensor,
prompt_vocab_size: Tensor,
ite: int,
sequence_limit_lengths: Tensor,
stop_words_data,
bad_words_data,
output_sequence_lengths: bool = False,
output_generation_logits: bool = False,
return_dict: bool = False,
encoder_output: Tensor = None,
encoder_input_lengths: Tensor = None,
stopping_criteria: StoppingCriteria = None,
logits_processor: LogitsProcessor = None,
cross_attention_mask: List[Tensor] = None,
**kwargs,
)[source]#
decode_stream(
*,
batch_size: int,
scfg: SamplingConfig,
sequence_lengths: Tensor,
context_lengths: Tensor,
host_context_lengths,
max_context_length: int,
beam_width: int,
cache_indirections: list,
input_ids: Tensor,
hidden_states: Tensor,
prompt_embedding_table: Tensor,
tasks: Tensor,
prompt_vocab_size: Tensor,
ite: int,
sequence_limit_lengths: Tensor,
stop_words_data,
bad_words_data,
output_sequence_lengths: bool = False,
output_generation_logits: bool = False,
return_dict: bool = False,
encoder_output: Tensor = None,
encoder_input_lengths: Tensor = None,
stopping_criteria: StoppingCriteria = None,
logits_processor: LogitsProcessor = None,
cross_attention_mask: List[Tensor] = None,
**kwargs,
)[source]#
device: device#
property dtype#
dump_debug_buffers(step: int) None[source]#
early_stop_criteria(batch_size, step, should_stop)[source]#
property engine_inspector#
filter_medusa_logits(
batch_size,
best_path,
best_path_lengths,
medusa_logits,
)[source]#

medusa_logits has shape [nMH, bs, nMT+1, vocab]

Returns [nMH, bs, vocab]

finalize_decoder(
context_lengths,
batch_size,
beam_width,
scfg,
in_progress=False,
)[source]#
find_best_medusa_path(
batch_size,
input_ids: Tensor,
next_logits,
temp=0,
)[source]#
property first_layer#
property gather_context_logits#
property gather_generation_logits#
property gemm_allreduce_plugin#
get_next_medusa_tokens(
batch_size,
next_medusa_logits,
)[source]#
get_num_heads_kv(
layer_idx: int | None = None,
) int[source]#
handle_per_step(
*,
cache_indirections: list,
step: int,
batch_size: int,
max_context_length: int,
beam_width: int,
input_ids: Tensor,
hidden_states: Tensor,
scfg: SamplingConfig,
kv_cache_block_offsets: Tensor,
host_kv_cache_block_offsets: Tensor,
cross_kv_cache_block_offsets: Tensor,
host_cross_kv_cache_block_offsets: Tensor,
prompt_embedding_table: Tensor,
tasks: Tensor,
context_lengths: Tensor,
host_context_lengths,
attention_mask: Tensor,
cross_attention_mask_for_context: Tensor,
cross_attention_mask_for_gen: Tensor,
prompt_vocab_size: Tensor,
ite: int,
sequence_limit_lengths: Tensor,
sequence_lengths: Tensor,
next_step_tensors: Dict[str, RuntimeTensor],
stop_words_data,
bad_words_data,
encoder_output: Tensor,
encoder_input_lengths: Tensor,
stopping_criteria: StoppingCriteria,
logits_processor: LogitsProcessor,
output_generation_logits: bool,
**kwargs,
)[source]#
property has_position_embedding#
property has_token_type_embedding#
property head_size#
property hidden_size#
property is_medusa_mode#
property is_redrafter_mode#
property kv_cache_type#
property last_layer#
locate_accepted_draft_tokens(
batch_size,
best_path,
best_path_len,
draft_paths,
)[source]#
mapping: Mapping#
property max_draft_tokens#
property max_prompt_embedding_table_size#
medusa_decode_and_verify(step, batch_size, logits)[source]#
medusa_paths: List[List[int]] = None#
medusa_position_offsets: List[int] = None#
medusa_temperature: float = 0.0#
medusa_topks: List[int] = None#
medusa_tree_ids: List[int] = None#
next_medusa_input_ids()[source]#
num_draft_tokens: int = 0#
property num_heads#
property num_layers#
property num_medusa_heads#
property paged_kv_cache#
property paged_state#
pp_communicate_final_output_ids(
final_output_ids,
batch_size,
beam_width,
)[source]#
pp_communicate_new_tokens(
should_stop,
cache_indir,
sequence_length,
)[source]#
process_logits_including_draft(
step,
batch_size,
logits,
next_step_buffer,
)[source]#
  1. Process the logits into tokens and verify them (Medusa), or process the outputs (ReDrafter)

  2. Extract the early-stop criterion here: self.accept_length

  3. Update the output ids: needs self.new_tokens and past_sequence_length

  4. Get the next input_ids: self.[new_tokens, accept_lengths, medusa_output_tokens]

  5. Update the KV cache: self.[sequence_length, num_draft_tokens]

  6. Update sequence_length_buffer and past_kv_length

property profiler#
property quant_mode#
property remove_input_padding#
property rnn_conv_dim_size#
property rnn_head_size#
property rnn_hidden_size#
runtime: _Runtime#
setup(
batch_size: int,
max_context_length: int,
max_new_tokens: int,
beam_width: int = 1,
max_attention_window_size: int | None = None,
sink_token_length: int | None = None,
encoder_max_input_length: int | None = None,
lora_manager: LoraManager = None,
lora_uids: List[str] = None,
medusa_choices: List[List[int]] = None,
multi_block_mode: bool = True,
enable_context_fmha_fp32_acc: bool = None,
)[source]#
property state_dtype#
property state_size#
property tokens_per_block#
update_output_ids_by_offset(
new_generated_ids,
offsets,
)[source]#
property use_gemm_allreduce_plugin#
property use_gpt_attention_plugin#
property use_kv_cache#
property use_lora_plugin#
property use_mamba_conv1d_plugin#
property vocab_size#
class tensorrt_llm.runtime.KVCacheManager(
*,
num_layers: int,
num_blocks: int,
block_size: int,
tokens_per_block: int,
max_blocks_per_seq: int,
max_attention_window_size: int,
sink_token_len: int,
beam_width: int = 1,
use_one_more_block: bool = False,
)[source]#

Bases: object

add_sequence(
sequence: GenerationSequence,
context_len: int,
always_share_across_beam: bool = False,
)[source]#

Adds a sequence to the manager and allocates the minimum number of blocks for the context

get_block_offsets(beam_width: int) Tensor[source]#

Returns an array of offsets into the memory pool

step(finished: List[bool])[source]#

Advances to the next generation step. Adds new blocks where needed and clears finished sequences.
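
A minimal sketch of driving the manager by hand with a single-beam request; all sizing values below are illustrative assumptions, since a real runtime derives them from the engine/model config:

from tensorrt_llm.runtime import GenerationSequence, KVCacheManager

manager = KVCacheManager(
    num_layers=2,
    num_blocks=64,
    block_size=8 * 64 * 32,        # assumed num_kv_heads * head_size * tokens_per_block
    tokens_per_block=32,
    max_blocks_per_seq=16,
    max_attention_window_size=512,
    sink_token_len=0,
    beam_width=1,
)

# Register one request with a 100-token context; the manager allocates
# the minimum number of blocks that covers it.
seq = GenerationSequence(seq_idx=0, batch_idx=0)
manager.add_sequence(seq, context_len=100)

# Offsets into the memory pool, passed to the engine at each step.
offsets = manager.get_block_offsets(beam_width=1)

# Advance one generation step; mark entries True once a sequence finishes.
manager.step(finished=[False])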

class tensorrt_llm.runtime.LogitsProcessor[source]#

Bases: object

Base class for all logits processors that can be applied during generation.
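
A subclass overrides __call__ to edit the logits at each step. A minimal sketch, assuming the Python runtime invokes the processor as (step, input_ids, scores) and expects the (possibly modified) scores back:

import torch

from tensorrt_llm.runtime import LogitsProcessor

class ForbidTokenProcessor(LogitsProcessor):
    """Masks out a single token id at every generation step."""

    def __init__(self, forbidden_id: int):
        super().__init__()
        self.forbidden_id = forbidden_id

    def __call__(self, step: int, input_ids: torch.Tensor,
                 scores: torch.Tensor) -> torch.Tensor:
        # Setting the logit to -inf removes the token from sampling.
        scores[..., self.forbidden_id] = float('-inf')
        return scores

An instance can then be passed as the logits_processor argument of GenerationSession.decode() or ModelRunner.generate().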

class tensorrt_llm.runtime.LogitsProcessorList(iterable=(), /)[source]#

Bases: list, LogitsProcessor

class tensorrt_llm.runtime.ModelConfig(
max_batch_size: int,
max_beam_width: int,
vocab_size: int,
num_layers: int,
num_heads: int,
num_kv_heads: int,
hidden_size: int,
gpt_attention_plugin: bool,
gemm_allreduce_plugin: str = None,
remove_input_padding: bool = False,
model_name: str = '',
kv_cache_type: tensorrt_llm.bindings.KVCacheType = <KVCacheType.CONTINUOUS: 0>,
cross_attention: bool = False,
head_size: int = None,
has_position_embedding: bool = True,
has_token_type_embedding: bool = False,
tokens_per_block: int = 32,
max_prompt_embedding_table_size: int = 0,
quant_mode: tensorrt_llm.quantization.mode.QuantMode = <QuantMode: 0>,
gather_context_logits: bool = False,
gather_generation_logits: bool = False,
dtype: str = '',
lora_plugin: bool = False,
lora_target_modules: List[str] = <factory>,
trtllm_modules_to_hf_modules: dict = None,
skip_cross_kv: bool = False,
num_medusa_heads: int = 0,
max_medusa_tokens: int = 0,
paged_state: bool = True,
mamba_conv1d_plugin: bool = True,
conv_kernel: int = 0,
layer_types: List[str] = <factory>,
rnn_hidden_size: int = 0,
rnn_head_size: int = 0,
rnn_conv_dim_size: int = 0,
state_size: int = 0,
state_dtype: str = '',
gpu_weights_percent: float = 1.0,
redrafter_num_beams: int = 0,
redrafter_draft_len_per_beam: int = 0,
num_kv_heads_per_layer: Optional[List[int]] = None,
num_kv_heads_per_cross_attn_layer: Optional[List[int]] = None,
skip_cross_attn_blocks: bool = False,
language_adapter_config: Optional[tensorrt_llm.layers.language_adapter.LanguageAdapterConfig] = None,
)[source]#

Bases: object

conv_kernel: int = 0#
cross_attention: bool = False#
dtype: str = ''#
gather_context_logits: bool = False#
gather_generation_logits: bool = False#
gemm_allreduce_plugin: str = None#
gpt_attention_plugin: bool#
gpu_weights_percent: float = 1.0#
has_position_embedding: bool = True#
has_token_type_embedding: bool = False#
head_size: int = None#
hidden_size: int#
kv_cache_type: KVCacheType = <KVCacheType.CONTINUOUS: 0>#
language_adapter_config: LanguageAdapterConfig | None = None#
layer_types: List[str]#
lora_plugin: bool = False#
lora_target_modules: List[str]#
mamba_conv1d_plugin: bool = True#
max_batch_size: int#
max_beam_width: int#
max_medusa_tokens: int = 0#
max_prompt_embedding_table_size: int = 0#
model_name: str = ''#
num_heads: int#
num_kv_heads: int#
num_kv_heads_per_cross_attn_layer: List[int] | None = None#
num_kv_heads_per_layer: List[int] | None = None#
num_layers: int#
num_medusa_heads: int = 0#
paged_state: bool = True#
quant_mode: QuantMode = 0#
redrafter_draft_len_per_beam: int = 0#
redrafter_num_beams: int = 0#
remove_input_padding: bool = False#
rnn_conv_dim_size: int = 0#
rnn_head_size: int = 0#
rnn_hidden_size: int = 0#
skip_cross_attn_blocks: bool = False#
skip_cross_kv: bool = False#
state_dtype: str = ''#
state_size: int = 0#
tokens_per_block: int = 32#
trtllm_modules_to_hf_modules: dict = None#
vocab_size: int#
class tensorrt_llm.runtime.ModelRunner(
session: GenerationSession,
max_batch_size: int,
max_input_len: int,
max_seq_len: int,
max_beam_width: int,
kv_cache_type: KVCacheType,
lora_manager: LoraManager | None = None,
)[source]#

Bases: ModelRunnerMixin

An interface class that wraps a GenerationSession and provides generation methods.

property dtype: dtype#
classmethod from_dir(
engine_dir: str,
*,
max_output_len: int | None = None,
lora_dir: List[str] | None = None,
rank: int = 0,
debug_mode: bool = False,
lora_ckpt_source: str = 'hf',
medusa_choices: List[List[int]] = None,
stream: Stream = None,
gpu_weights_percent: float = 1,
enable_context_fmha_fp32_acc: bool | None = None,
multi_block_mode: bool | None = None,
) ModelRunner[source]#

Creates a ModelRunner instance from an engine directory.

Parameters:
  • engine_dir (str) – The directory that contains the serialized engine files and config files.

  • max_output_len (Optional[int]) – The maximum output length. This argument may only take effect at load time; when the KV cache is disabled, generation is still checked against it.

  • lora_dir (Optional[List[str]]) – The directories that contain the LoRA weights.

  • rank (int) – The runtime rank id.

  • debug_mode (bool) – Whether to turn on debug mode.

  • medusa_choices (List[List[int]]) – The Medusa choices to use in Medusa decoding.

  • stream (torch.cuda.Stream) – The stream to use.

  • multi_block_mode (bool) – Whether to distribute the work across multiple CUDA thread blocks on the GPU for the masked MHA kernel.

Returns:

An instance of ModelRunner.

Return type:

ModelRunner
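
A typical single-GPU load, sketched with a placeholder path to a prebuilt engine directory:

from tensorrt_llm.runtime import ModelRunner

# "./engine_dir" is a placeholder path to a prebuilt engine directory.
runner = ModelRunner.from_dir(engine_dir="./engine_dir", rank=0)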

classmethod from_engine(
engine: Engine,
*,
max_output_len: int | None,
lora_dir: List[str] | None,
rank: int,
debug_mode: bool,
lora_ckpt_source: str,
medusa_choices: List[List[int]],
stream: Stream,
gpu_weights_percent: float,
enable_context_fmha_fp32_acc: bool | None,
multi_block_mode: bool | None,
) ModelRunner[source]#
property gather_context_logits: bool#
property gather_generation_logits: bool#
generate(
batch_input_ids: List[Tensor],
position_ids: List[Tensor] = None,
sampling_config: SamplingConfig | None = None,
prompt_table: str | Tensor | None = None,
prompt_tasks: str | None = None,
lora_uids: list | None = None,
streaming: bool = False,
output_generation_logits: bool = False,
stopping_criteria: StoppingCriteria | None = None,
logits_processor: LogitsProcessor | None = None,
medusa_choices: List[List[int]] | None = None,
encoder_max_input_length: int = None,
encoder_input_features: List[Tensor] = None,
encoder_output_lengths: List[Tensor] = None,
cross_attention_masks: List[Tensor] = None,
**kwargs,
) Tensor | dict[source]#

Generates sequences of token ids. The generation-controlling parameters are set in sampling_config; it is set to a default one if not passed. You can override any attribute of sampling_config by passing the corresponding parameter.

Parameters:
  • batch_input_ids (List[torch.Tensor]) – A list of input id tensors. Each tensor has shape (sequence_length, ).

  • sampling_config (SamplingConfig) – The sampling configuration used as the base parametrization for the generation call. The passed **kwargs matching the attributes of sampling_config will override them. If sampling_config is not provided, a default one will be used.

  • prompt_table (str | torch.Tensor) – The file path of the prompt table (.npy format, exported by nemo_prompt_convert.py) or the prompt table itself.

  • prompt_tasks (str) – The prompt tuning task ids for the input batch, in the format of a comma-separated list (e.g., 0,3,1,0).

  • lora_uids (list) – The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module.

  • streaming (bool) – Whether or not to use streaming mode for generation.

  • stopping_criteria (StoppingCriteria) – Custom stopping criteria.

  • logits_processor (LogitsProcessor) – Custom logits processors.

  • medusa_choices (List[List[int]]) – Medusa decoding choices.

  • kwargs (Dict[str, Any]) – Ad-hoc parametrization of sampling_config. The passed **kwargs matching the attributes of sampling_config will override them.

Returns:

If return_dict=False, the method returns the generated output_ids. If return_dict=True, the method returns a dict containing output_ids, sequence_lengths (if sampling_config.output_sequence_lengths=True), and context_logits and generation_logits (if self.gather_context_logits=True and self.gather_generation_logits=True, respectively).

Return type:

torch.Tensor or dict
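
For illustration, a minimal call on a runner created by from_dir; the token ids and end/pad ids are placeholders, and the keyword arguments override the matching SamplingConfig attributes:

import torch

# One 1-D input id tensor per batch entry; the ids are placeholders.
batch_input_ids = [torch.tensor([1, 7, 42, 99], dtype=torch.int32)]

outputs = runner.generate(
    batch_input_ids,
    max_new_tokens=32,          # overrides SamplingConfig.max_new_tokens
    end_id=2,                   # placeholder end/pad token ids
    pad_id=2,
    temperature=0.8,
    top_k=40,
    return_dict=True,
    output_sequence_lengths=True,
)
torch.cuda.synchronize()
output_ids = outputs["output_ids"]              # [batch_size, num_beams, seq_len]
sequence_lengths = outputs["sequence_lengths"]  # [batch_size, num_beams]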

property hidden_size: int#
property mapping: Mapping#
property max_prompt_embedding_table_size: int#
property max_sequence_length: int#
property num_heads: int#
property num_layers: int#
property remove_input_padding: bool#
serialize_engine() IHostMemory[source]#

Serializes the engine.

Returns:

The serialized engine.

Return type:

bytes

property use_lora_plugin: bool#
property vocab_size: int#
property vocab_size_padded: int#
class tensorrt_llm.runtime.ModelRunnerCpp(
executor: Executor,
max_batch_size: int,
max_input_len: int,
max_seq_len: int,
max_beam_width: int,
model_config: ModelConfig,
world_config: WorldConfig,
use_kv_cache: bool,
lora_manager: LoraManager | None = None,
)[source]#

Bases: ModelRunnerMixin

An interface class that wraps an Executor and provides generation methods.

property dtype: dtype#
classmethod from_dir(
engine_dir: str,
*,
lora_dir: str | None = None,
rank: int = 0,
max_batch_size: int | None = None,
max_input_len: int | None = None,
max_output_len: int | None = None,
max_beam_width: int | None = None,
max_attention_window_size: list[int] | None = None,
sink_token_length: int | None = None,
kv_cache_free_gpu_memory_fraction: float | None = None,
cross_kv_cache_fraction: float | None = None,
medusa_choices: list[list[int]] | None = None,
eagle_choices: list[list[int]] | None = None,
eagle_posterior_threshold: float | None = None,
eagle_use_dynamic_tree: bool = False,
eagle_dynamic_tree_max_top_k: int | None = None,
lookahead_config: list[int] | None = None,
debug_mode: bool = False,
lora_ckpt_source: str = 'hf',
use_gpu_direct_storage: bool = False,
gpu_weights_percent: float = 1,
max_tokens_in_paged_kv_cache: int | None = None,
kv_cache_enable_block_reuse: bool = False,
enable_chunked_context: bool = False,
is_enc_dec: bool = False,
multi_block_mode: bool = True,
enable_context_fmha_fp32_acc: bool | None = None,
cuda_graph_mode: bool | None = None,
logits_processor_map: Dict[str, LogitsProcessor] | None = None,
device_ids: List[int] | None = None,
is_orchestrator_mode: bool = False,
use_runtime_defaults: bool = True,
gather_generation_logits: bool = False,
use_variable_beam_width_search: bool = False,
mm_embedding_offloading: bool = False,
) ModelRunnerCpp[source]#

Creates a ModelRunnerCpp instance from an engine directory.

Parameters:
  • engine_dir (str) – The directory that contains the serialized engine files and config files.

  • lora_dir (str) – The directory that contains the LoRA weights.

  • rank (int) – The runtime rank id.

  • max_batch_size (int) – The runtime batch size limit. If max_batch_size is not None, it should not be larger than the engine's max_batch_size; otherwise the engine's max_batch_size will be used.

  • max_input_len (int) – The runtime input length limit. If max_input_len is not None, it should not be larger than the engine's max_input_len; otherwise the engine's max_input_len will be used.

  • max_output_len (int) – The runtime output length limit. If max_output_len is not None, it should not be larger than the engine's max_output_len; otherwise the engine's max_output_len will be used.

  • max_beam_width (int) – The runtime beam width limit. If max_beam_width is not None, it should not be larger than the engine's max_beam_width; otherwise the engine's max_beam_width will be used.

  • max_attention_window_size (List[int]) – The attention window size that controls the sliding-window-attention / cyclic-KV-cache behavior.

  • sink_token_length (int) – The sink token length; defaults to 0.

  • kv_cache_free_gpu_memory_fraction (float) – The fraction of free GPU memory used by the KV cache.

  • cross_kv_cache_fraction (float) – The fraction of the KV cache reserved for cross attention; should only be used with enc-dec models.

  • debug_mode (bool) – Whether to turn on debug mode.

  • medusa_choices (List[List[int]]) – The Medusa choices to use in Medusa decoding.

  • eagle_choices (List[List[int]]) – The Eagle choices to use in Eagle-1 decoding.

  • eagle_posterior_threshold (float) – The minimum token probability threshold for typical acceptance. A non-None value enables typical acceptance in Eagle.

  • eagle_use_dynamic_tree (bool) – Whether to use Eagle-2 (dynamic tree).

  • eagle_dynamic_tree_max_top_k (int) – The maximum number of draft tokens to expand for each node in Eagle-2.

  • lora_ckpt_source (str) – The source of the checkpoint. Should be one of ['hf', 'nemo'].

  • max_tokens_in_paged_kv_cache (int) – The maximum number of tokens configured in the KV cache.

  • kv_cache_enable_block_reuse (bool) – Enables block reuse in the KV cache.

  • enable_chunked_context (bool) – Enables chunked context.

  • is_enc_dec (bool) – Whether the model is an encoder-decoder architecture.

  • multi_block_mode (bool) – Whether to distribute the work across multiple CUDA thread blocks on the GPU for the masked MHA kernel.

  • enable_context_fmha_fp32_acc (bool) – Enables FP32 accumulation for the FMHA runner.

  • cuda_graph_mode (bool) – Whether to use CUDA graphs at inference time.

  • logits_processor_map (Dict[str, LogitsProcessor]) – A map of logits processor functions indexed by name. A name can later be provided to the generate() function to specify which logits processor to run.

  • device_ids (List[int]) – The device indices to run the Executor on.

  • is_orchestrator_mode (bool) – The mode to run the model runner; defaults to leader mode.

  • gather_generation_logits (bool) – Enables gathering of generation logits.

Returns:

An instance of ModelRunnerCpp.

Return type:

ModelRunnerCpp
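
A sketch of loading through the C++ executor with an explicit KV cache budget; the path and all limits are placeholders and must respect the engine's build-time maxima:

from tensorrt_llm.runtime import ModelRunnerCpp

runner = ModelRunnerCpp.from_dir(
    engine_dir="./engine_dir",               # placeholder path
    max_batch_size=8,                        # must not exceed the engine's limit
    kv_cache_free_gpu_memory_fraction=0.9,   # give the KV cache 90% of free VRAM
    kv_cache_enable_block_reuse=True,
    enable_chunked_context=True,
)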

property gather_context_logits: bool#
property gather_generation_logits: bool#
generate(
batch_input_ids: List[Tensor],
*,
position_ids: List[Tensor] = None,
encoder_input_ids: List[Tensor] = None,
encoder_input_features: List[Tensor] = None,
encoder_output_lengths: List[int] = None,
cross_attention_masks: List[Tensor] = None,
mrope_params: MropeParams | None = None,
sampling_config: SamplingConfig | None = None,
lora_uids: list | None = None,
lookahead_config: list[int] | None = None,
streaming: bool = False,
stopping_criteria: StoppingCriteria | None = None,
logits_processor_names: list[str] | None = None,
max_new_tokens: int = 1,
end_id: int | None = None,
pad_id: int | None = None,
bad_words_list: list[list[int]] | None = None,
stop_words_list: list[list[int]] | None = None,
return_dict: bool = False,
output_sequence_lengths: bool = False,
output_generation_logits: bool = False,
output_log_probs: bool = False,
output_cum_log_probs: bool = False,
prompt_table: str | Tensor | None = None,
prompt_tasks: str | None = None,
input_token_extra_ids: List[List[int]] = None,
return_all_generated_tokens: bool = False,
language_adapter_uids: List[int] | None = None,
mm_embedding_offloading: bool = False,
**kwargs,
) Tensor | dict[source]#

Generates sequences of token ids. The generation-controlling parameters are set in sampling_config; it is set to a default one if not passed. You can override any attribute of sampling_config by passing the corresponding parameter.

Parameters:
  • batch_input_ids (List[torch.Tensor]) – A list of input id tensors. Each tensor has shape (sequence_length, ).

  • position_ids (List[torch.Tensor]) – A list of position id tensors. Each tensor has shape (sequence_length, ).

  • encoder_input_ids (List[torch.Tensor]) – A list of encoder input id tensors for encoder-decoder models (optional). Each tensor has shape (sequence_length, ).

  • encoder_input_features (List[torch.Tensor]) – A list of encoder input feature tensors for multimodal encoder-decoder models (optional). Each tensor has shape (sequence_length, feature_dim).

  • encoder_output_lengths (List[int]) – A list of encoder output lengths (optional), for when the encoder output has a different length from the encoder input (due to convolution down-sampling, etc.).

  • sampling_config (SamplingConfig) – The sampling configuration used as the base parametrization for the generation call. The passed **kwargs matching the attributes of sampling_config will override them. If sampling_config is not provided, a default one will be used.

  • prompt_table (str | torch.Tensor) – The file path of the prompt table (.npy format, exported by nemo_prompt_convert.py) or the prompt table itself.

  • prompt_tasks (str) – The prompt tuning task ids for the input batch, in the format of a comma-separated list (e.g., 0,3,1,0).

  • input_token_extra_ids (List[List[int]]) – The input token extra ids for using p-tuning and KV cache reuse together.

  • lora_uids (list) – The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module.

  • streaming (bool) – Whether or not to use streaming mode for generation.

  • stopping_criteria (StoppingCriteria) – Custom stopping criteria.

  • logits_processor_names (List[str]) – Custom logits processor names.

  • return_all_generated_tokens (bool) – Whether the full output is returned at each streaming step.

  • kwargs (Dict[str, Any]) – Ad-hoc parametrization of sampling_config. The passed **kwargs matching the attributes of sampling_config will override them.

Returns:

If return_dict=False, the method returns the generated output_ids. If return_dict=True, the method returns a dict containing output_ids, sequence_lengths (if sampling_config.output_sequence_lengths=True), and context_logits and generation_logits (if self.gather_context_logits=True and self.gather_generation_logits=True, respectively).

Return type:

torch.Tensor or dict
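
Unlike the Python ModelRunner, decoding controls such as max_new_tokens, end_id and pad_id appear directly in this signature. A sketch with placeholder token ids:

import torch

batch_input_ids = [torch.tensor([1, 7, 42, 99], dtype=torch.int32)]

outputs = runner.generate(
    batch_input_ids,
    max_new_tokens=64,
    end_id=2,                    # placeholder end/pad token ids
    pad_id=2,
    return_dict=True,
    output_sequence_lengths=True,
)
output_ids = outputs["output_ids"]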

property hidden_size: int#
property max_prompt_embedding_table_size: int#
property max_sequence_length: int#
property num_heads: int#
property num_layers: int#
property remove_input_padding: bool#
property vocab_size: int#
property vocab_size_padded: int#
class tensorrt_llm.runtime.MultimodalModelRunner(args)[source]#

Bases: object

property audio_engine_dir#
property cpp_e2e#
property cpp_llm_only#
generate(
pre_prompt,
post_prompt,
image,
decoder_input_ids,
max_new_tokens,
other_vision_inputs={},
other_audio_inputs={},
other_decoder_inputs={},
)[source]#
get_audio_features(audio, other_audio_inputs)[source]#
get_rope_index(
input_ids: LongTensor,
image_grid_thw: LongTensor | None = None,
video_grid_thw: LongTensor | None = None,
attention_mask: Tensor | None = None,
) Tuple[Tensor, Tensor][source]#

Calculates the 3D rope index based on the temporal, height and width dimensions of images and videos in the LLM.

Explanation:

Each embedding sequence contains vision embeddings and text embeddings, or just text embeddings.

For a pure text embedding sequence, the rotary position embedding is no different from that of modern LLMs. Example:

input_ids: [T T T T T], where T is text.
temporal position_ids: [0, 1, 2, 3, 4]
height position_ids: [0, 1, 2, 3, 4]
width position_ids: [0, 1, 2, 3, 4]

For a mixed vision and text embedding sequence, we compute 3D rotary position embeddings for the vision part and 1D rotary position embeddings for the text part. Example:

Suppose we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
input_ids: [V V V V V V V V V V V V T T T T T], where V is vision.
vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
text temporal position_ids: [3, 4, 5, 6, 7]
text height position_ids: [3, 4, 5, 6, 7]
text width position_ids: [3, 4, 5, 6, 7]
Here the text start position_ids are computed as the maximum vision position_ids plus 1.

Parameters:
  • input_ids (torch.LongTensor of shape (batch_size, sequence_length)) – Indices of the input sequence tokens in the vocabulary. Padding will be ignored by default, should you provide it.

  • image_grid_thw (torch.LongTensor of shape (num_images, 3), optional) – The temporal, height and width of the feature shape of each image in the LLM.

  • video_grid_thw (torch.LongTensor of shape (num_videos, 3), optional) – The temporal, height and width of the feature shape of each video in the LLM.

  • attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional) –

    Mask to avoid performing attention on padding token indices. Mask values are selected in [0, 1]:

    • 1 for tokens that are not masked,

    • 0 for tokens that are masked.

Returns:

position_ids (torch.LongTensor of shape (3, batch_size, sequence_length))
mrope_position_deltas (torch.Tensor of shape (batch_size))

get_visual_features(image, other_vision_inputs)[source]#
init_audio_encoder()[source]#
init_image_encoder()[source]#
init_llm()[source]#
init_processor()[source]#
init_tokenizer()[source]#
property llm_engine_dir#
load_test_audio(audio_path)[source]#
load_test_data(image_path=None, video_path=None)[source]#
prepare_position_ids_for_cogvlm(input_ids)[source]#
preprocess(
pre_prompt,
post_prompt,
image,
other_vision_inputs,
other_audio_inputs,
)[source]#
ptuning_setup(
prompt_table,
input_ids,
input_lengths,
)[source]#
ptuning_setup_fuyu(
input_ids,
image_patches_indices,
)[source]#
ptuning_setup_llava_next(
visual_features,
pre_prompt,
post_prompt,
)[source]#
ptuning_setup_phi3(
visual_features,
audio_features,
input_ids,
num_img_tokens,
num_aud_tokens,
)[source]#
property python_e2e#
run(
input_text,
input_image,
input_audio,
max_new_tokens,
)[source]#
setup_fake_prompts(
visual_features,
pre_input_ids,
post_input_ids,
input_lengths,
)[source]#
setup_fake_prompts_qwen2vl(
visual_features,
input_ids,
vision_grid_thws,
attention_mask,
input_lengths,
)[source]#
setup_fake_prompts_vila(
batch_size,
visual_features,
split_input_ids,
input_lengths,
)[source]#
setup_inputs(
input_text,
raw_image,
raw_audio=None,
)[source]#
split_prompt_by_images(tensor)[source]#
static tokenizer_image_token(
batch_size,
pre_prompt,
post_prompt,
tokenizer,
image_token_index=-200,
)[source]#
video_preprocess(video_path)[source]#
property visual_engine_dir#
class tensorrt_llm.runtime.QWenForCausalLMGenerationSession(
model_config: ModelConfig,
engine_buffer,
mapping: Mapping,
debug_mode=False,
debug_tensors_to_save=None,
cuda_graph_mode=False,
stream: Stream = None,
global_max_input_length: int = 2048,
global_max_output_length: int = 4096,
)[source]#

Bases: GenerationSession

generate(
input_ids: Tensor,
input_lengths: Tensor,
sampling_config: SamplingConfig,
max_new_tokens: int,
runtime_rank: int = 0,
)[source]#
class tensorrt_llm.runtime.SamplingConfig(
end_id: int,
pad_id: int,
max_new_tokens: int = 20,
num_beams: int = 1,
num_return_sequences: int | None = None,
max_attention_window_size: int | None = None,
sink_token_length: int | None = None,
output_sequence_lengths: bool = False,
return_dict: bool = False,
stop_words_list: list | numpy.ndarray | torch.Tensor | NoneType = None,
bad_words_list: list | numpy.ndarray | torch.Tensor | NoneType = None,
temperature: float | torch.Tensor = 1.0,
top_k: int | torch.Tensor = 1,
top_p: float | torch.Tensor = 0.0,
top_p_decay: torch.Tensor | None = None,
top_p_min: torch.Tensor | None = None,
top_p_reset_ids: torch.Tensor | None = None,
random_seed: int | torch.Tensor = None,
length_penalty: float | torch.Tensor = 1.0,
early_stopping: int | torch.Tensor = 1,
repetition_penalty: float | torch.Tensor = 1.0,
min_length: int | torch.Tensor = 1,
presence_penalty: float | torch.Tensor = 0.0,
frequency_penalty: float | torch.Tensor = 0.0,
use_beam_hyps: bool = True,
min_p: float | torch.Tensor = 0.0,
)[source]#

Bases: object

bad_words_list: list | ndarray | Tensor | None = None#
beam_search_diversity_rate: float | Tensor = 0.0#
early_stopping: int | Tensor = 1#
end_id: int#
frequency_penalty: float | Tensor = 0.0#
length_penalty: float | Tensor = 1.0#
max_attention_window_size: int | None = None#
max_new_tokens: int = 20#
min_length: int | Tensor = 1#
min_p: float | Tensor = 0.0#
no_repeat_ngram_size: int | Tensor = None#
num_beams: int = 1#
num_return_sequences: int | None = None#
output_cum_log_probs: bool = False#
output_log_probs: bool = False#
output_sequence_lengths: bool = False#
pad_id: int#
presence_penalty: float | Tensor = 0.0#
random_seed: int | Tensor = None#
repetition_penalty: float | Tensor = 1.0#
return_dict: bool = False#
sink_token_length: int | None = None#
stop_words_list: list | ndarray | Tensor | None = None#
temperature: float | Tensor = 1.0#
top_k: int | Tensor = 1#
top_p: float | Tensor = 0.0#
top_p_decay: Tensor | None = None#
top_p_min: Tensor | None = None#
top_p_reset_ids: Tensor | None = None#
update(**kwargs)[source]#
use_beam_hyps: bool = True#
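
For example, when driving GenerationSession.decode() directly, a config is built once and tweaked with update(); the end/pad token ids below are placeholders:

from tensorrt_llm.runtime import SamplingConfig

sampling_config = SamplingConfig(
    end_id=2,                 # placeholder end/pad token ids
    pad_id=2,
    max_new_tokens=64,
    num_beams=1,
    temperature=0.7,
    top_k=40,
    top_p=0.9,
)

# Ad-hoc overrides without rebuilding the object.
sampling_config.update(temperature=1.0, random_seed=42)
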
class tensorrt_llm.runtime.Session(**kwargs)[source]#

Bases: object

A Session is a managed TensorRT runtime.

property context: IExecutionContext#
@brief: Get the default TensorRT execution context; use self.engine.create_execution_context() to create a new context if needed.

@return: A TensorRT execution context object.

property context_mem_size: int#
property engine: ICudaEngine#
static from_engine(engine) Session[source]#

@brief: Create a session from an existing ICudaEngine. @param engine: an ICudaEngine. @return: a Session object.

static from_serialized_engine(
engine,
) Session[source]#

@brief: Create a session from a serialized engine. @param engine: a serialized engine. @return: a Session object.

infer_shapes(
inputs: List[TensorInfo],
context: IExecutionContext | None = None,
) List[TensorInfo][source]#
@brief: Set the input shapes for the given context, and infer the output shapes from the given input shapes.

This function should be called every time the input shapes change, before calling run(). Alternatively, call context.set_input_shape manually on all input tensors with dynamic shapes.

@param inputs: a list of TensorInfo objects, each item representing an input tensor. @param context: a TensorRT execution context; if None, the default context is used. @return: a list of TensorInfo objects, each item representing an output tensor, or None on failure.

run(
inputs: Dict[str, Any],
outputs: Dict[str, Any],
stream,
context=None,
) bool[source]#

@brief: Run the TensorRT engine with the given inputs and outputs. @param inputs: a dict of input tensors, where the key is the tensor name and the value is a tensor pointer or torch tensor. @param outputs: a dict of output tensors, where the key is the tensor name and the value is a tensor pointer or torch tensor. @param stream: the CUDA stream on which to enqueue the TensorRT engine. @param context: a TensorRT execution context; if None, the default context is used. @return: True if the enqueue succeeds; note that enqueue is an asynchronous call, so returning True does not mean the execution has finished.
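
Putting from_serialized_engine, infer_shapes and run together; a minimal sketch assuming a serialized engine at a placeholder path, a single input tensor named "input", and FP32 outputs:

import tensorrt as trt
import torch

from tensorrt_llm.runtime import Session, TensorInfo

with open("model.engine", "rb") as f:        # placeholder engine path
    session = Session.from_serialized_engine(f.read())

inputs = {"input": torch.ones(1, 16, dtype=torch.float32, device="cuda")}

# Propagate the input shapes through the engine to size the outputs.
output_info = session.infer_shapes(
    [TensorInfo("input", trt.DataType.FLOAT, tuple(inputs["input"].shape))]
)
outputs = {
    t.name: torch.empty(tuple(t.shape), dtype=torch.float32, device="cuda")
    for t in output_info
}

stream = torch.cuda.current_stream()
ok = session.run(inputs=inputs, outputs=outputs, stream=stream.cuda_stream)
assert ok
stream.synchronize()   # run() only enqueues; wait for the results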

property runtime: Runtime#
set_shapes(
tensor_dict: Dict[str, Tensor],
context: IExecutionContext | None = None,
)[source]#
class tensorrt_llm.runtime.StoppingCriteria[source]#

Bases: object

Base class for all stopping criteria that can be applied during generation.
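
A subclass returns True from __call__ to halt generation. A minimal sketch, assuming the Python runtime's (step, input_ids, scores) calling convention:

import torch

from tensorrt_llm.runtime import StoppingCriteria

class MaxStepsCriteria(StoppingCriteria):
    """Stops generation after a fixed number of decoding steps."""

    def __init__(self, max_steps: int):
        super().__init__()
        self.max_steps = max_steps

    def __call__(self, step: int, input_ids: torch.Tensor,
                 scores: torch.Tensor) -> bool:
        return step >= self.max_steps

An instance can be passed as the stopping_criteria argument of decode() or generate().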

class tensorrt_llm.runtime.StoppingCriteriaList(iterable=(), /)[source]#

Bases: list, StoppingCriteria

class tensorrt_llm.runtime.TensorInfo(name: 'str', dtype: 'trt.DataType', shape: 'tuple')[source]#

Bases: object

dtype: DataType#
name: str#
numel()[source]#
shape: tuple#
squeeze(dim=0)[source]#
view(*shape)[source]#
tensorrt_llm.runtime.decode_words_list(
word_dict: List[List[str]],
tokenizer=None,
add_special_tokens=False,
)[source]#
Format of word_dict:

len(word_dict) should equal batch_size. word_dict[i] holds the words for batch item i, and len(word_dict[i]) >= 1, meaning it must contain at least one string. For example, word_dict[2] = [" I am happy", " I am sad"].
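
A sketch of preparing per-batch word lists, assuming a Hugging Face tokenizer (the model name is a placeholder); the result can be handed to decode() as the bad_words_list or stop_words_list argument:

from transformers import AutoTokenizer

from tensorrt_llm.runtime import decode_words_list

tokenizer = AutoTokenizer.from_pretrained("gpt2")   # placeholder tokenizer

# One word list per batch entry, each with at least one string.
word_dict = [
    [" bad phrase"],
    [" something else"],
    [" I am happy", " I am sad"],
]
bad_words_list = decode_words_list(word_dict, tokenizer)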