diff --git a/backend-python/rwkv_pip/webgpu/model.py b/backend-python/rwkv_pip/webgpu/model.py index 0c59528..96078c1 100644 --- a/backend-python/rwkv_pip/webgpu/model.py +++ b/backend-python/rwkv_pip/webgpu/model.py @@ -26,12 +26,19 @@ class RWKV: if s.startswith("layer") ) + chunk_size = ( + int(s.lstrip("chunk")) + for s in strategy.split() + for s in s.split(",") + if s.startswith("chunk") + ) + args = { "file": model_path, "turbo": True, "quant": next(layer, 31) if "i8" in strategy else 0, "quant_nf4": next(layer, 26) if "i4" in strategy else 0, - "token_chunk_size": 128, + "token_chunk_size": next(chunk_size, 32), "lora": None, } self.model = self.wrp.Model(**args) diff --git a/frontend/src/_locales/ja/main.json b/frontend/src/_locales/ja/main.json index dc70b63..419c625 100644 --- a/frontend/src/_locales/ja/main.json +++ b/frontend/src/_locales/ja/main.json @@ -343,5 +343,7 @@ "History Message Number": "履歴メッセージ数", "Send All Message": "すべてのメッセージを送信", "Quantized Layers": "量子化されたレイヤー", - "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "現在の精度で量子化されたニューラルネットワークのレイヤーの数、量子化するほどVRAMの使用量が低くなりますが、品質も相応に低下します。" + "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "現在の精度で量子化されたニューラルネットワークのレイヤーの数、量子化するほどVRAMの使用量が低くなりますが、品質も相応に低下します。", + "Parallel Token Chunk Size": "並列トークンチャンクサイズ", + "Maximum tokens to be processed in parallel at once. For high end GPUs, this could be 64 or 128 (faster).": "一度に並列で処理される最大トークン数。高性能なGPUの場合、64または128になります(高速)。" } \ No newline at end of file diff --git a/frontend/src/_locales/zh-hans/main.json b/frontend/src/_locales/zh-hans/main.json index 93afe52..32d7929 100644 --- a/frontend/src/_locales/zh-hans/main.json +++ b/frontend/src/_locales/zh-hans/main.json @@ -343,5 +343,7 @@ "History Message Number": "历史消息数量", "Send All Message": "发送所有消息", "Quantized Layers": "量化层数", - "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "神经网络以当前精度量化的层数, 量化越多, 占用显存越低, 但质量相应下降" + "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "神经网络以当前精度量化的层数, 量化越多, 占用显存越低, 但质量相应下降", + "Parallel Token Chunk Size": "并行Token块大小", + "Maximum tokens to be processed in parallel at once. For high end GPUs, this could be 64 or 128 (faster).": "一次最多可以并行处理的token数量. 对于高端显卡, 这可以是64或128 (更快)" } \ No newline at end of file diff --git a/frontend/src/pages/Configs.tsx b/frontend/src/pages/Configs.tsx index 6a503df..e95a09f 100644 --- a/frontend/src/pages/Configs.tsx +++ b/frontend/src/pages/Configs.tsx @@ -331,7 +331,21 @@ const Configs: FC = observer(() => { }} /> } /> } - {selectedConfig.modelParameters.device.startsWith('WebGPU') &&
} + { + selectedConfig.modelParameters.device.startsWith('WebGPU') && + { + setSelectedConfigModelParams({ + tokenChunkSize: data.value + }); + }} /> + } /> + } { selectedConfig.modelParameters.device.startsWith('WebGPU') && strategy += params.precision === 'nf4' ? 'fp16i4' : params.precision === 'int8' ? 'fp16i8' : 'fp16'; if (params.quantizedLayers) strategy += ` layer${params.quantizedLayers}`; + if (params.tokenChunkSize) + strategy += ` chunk${params.tokenChunkSize}`; break; case 'CUDA': case 'CUDA-Beta':