allow setting tokenChunkSize of WebGPU mode

josc146 2024-03-02 16:41:29 +08:00
parent c90cefc453
commit d91c3c004d
6 changed files with 32 additions and 4 deletions

View File

@@ -26,12 +26,19 @@ class RWKV:
             if s.startswith("layer")
         )
+        chunk_size = (
+            int(s.lstrip("chunk"))
+            for s in strategy.split()
+            for s in s.split(",")
+            if s.startswith("chunk")
+        )
         args = {
             "file": model_path,
             "turbo": True,
             "quant": next(layer, 31) if "i8" in strategy else 0,
             "quant_nf4": next(layer, 26) if "i4" in strategy else 0,
-            "token_chunk_size": 128,
+            "token_chunk_size": next(chunk_size, 32),
             "lora": None,
         }
         self.model = self.wrp.Model(**args)
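Illustrative only, not part of this commit: a minimal, self-contained Python sketch of how the parsing above behaves. The helper name parse_webgpu_strategy and the strategy strings are made-up examples; the point is that a "chunkN" token in the WebGPU strategy now sets token_chunk_size to N, and a strategy without one falls back to 32 instead of the previous hard-coded 128.

# Hypothetical helper mirroring the generator-based parsing in the hunk above.
def parse_webgpu_strategy(strategy: str) -> dict:
    layer = (
        int(s.lstrip("layer"))
        for s in strategy.split()
        for s in s.split(",")
        if s.startswith("layer")
    )
    chunk_size = (
        int(s.lstrip("chunk"))
        for s in strategy.split()
        for s in s.split(",")
        if s.startswith("chunk")
    )
    return {
        "quant": next(layer, 31) if "i8" in strategy else 0,
        "quant_nf4": next(layer, 26) if "i4" in strategy else 0,
        "token_chunk_size": next(chunk_size, 32),  # 32 when no "chunk" token is given
    }

print(parse_webgpu_strategy("fp16i8 layer24 chunk64"))
# {'quant': 24, 'quant_nf4': 0, 'token_chunk_size': 64}
print(parse_webgpu_strategy("fp16"))
# {'quant': 0, 'quant_nf4': 0, 'token_chunk_size': 32}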

View File

@@ -343,5 +343,7 @@
   "History Message Number": "履歴メッセージ数",
   "Send All Message": "すべてのメッセージを送信",
   "Quantized Layers": "量子化されたレイヤー",
-  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "現在の精度で量子化されたニューラルネットワークのレイヤーの数、量子化するほどVRAMの使用量が低くなりますが、品質も相応に低下します。"
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "現在の精度で量子化されたニューラルネットワークのレイヤーの数、量子化するほどVRAMの使用量が低くなりますが、品質も相応に低下します。",
+  "Parallel Token Chunk Size": "並列トークンチャンクサイズ",
+  "Maximum tokens to be processed in parallel at once. For high end GPUs, this could be 64 or 128 (faster).": "一度に並列で処理される最大トークン数。高性能なGPUの場合、64または128になります (高速)。"
 }

View File

@@ -343,5 +343,7 @@
   "History Message Number": "历史消息数量",
   "Send All Message": "发送所有消息",
   "Quantized Layers": "量化层数",
-  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "神经网络以当前精度量化的层数, 量化越多, 占用显存越低, 但质量相应下降"
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "神经网络以当前精度量化的层数, 量化越多, 占用显存越低, 但质量相应下降",
+  "Parallel Token Chunk Size": "并行Token块大小",
+  "Maximum tokens to be processed in parallel at once. For high end GPUs, this could be 64 or 128 (faster).": "一次最多可以并行处理的token数量. 对于高端显卡, 这可以是64或128 (更快)"
 }

View File

@@ -331,7 +331,21 @@ const Configs: FC = observer(() => {
                   }} />
               } />
             }
-            {selectedConfig.modelParameters.device.startsWith('WebGPU') && <div />}
+            {
+              selectedConfig.modelParameters.device.startsWith('WebGPU') &&
+              <Labeled label={t('Parallel Token Chunk Size')}
+                desc={t('Maximum tokens to be processed in parallel at once. For high end GPUs, this could be 64 or 128 (faster).')}
+                content={
+                  <ValuedSlider
+                    value={selectedConfig.modelParameters.tokenChunkSize || 32}
+                    min={16} max={256} step={16} input
+                    onChange={(e, data) => {
+                      setSelectedConfigModelParams({
+                        tokenChunkSize: data.value
+                      });
+                    }} />
+                } />
+            }
             {
               selectedConfig.modelParameters.device.startsWith('WebGPU') &&
               <Labeled label={t('Quantized Layers')}

View File

@@ -17,6 +17,7 @@ export type ModelParameters = {
   storedLayers: number;
   maxStoredLayers: number;
   quantizedLayers?: number;
+  tokenChunkSize?: number;
   useCustomCuda?: boolean;
   customStrategy?: string;
   useCustomTokenizer?: boolean;

View File

@@ -196,6 +196,8 @@ export const getStrategy = (modelConfig: ModelConfig | undefined = undefined) =>
       strategy += params.precision === 'nf4' ? 'fp16i4' : params.precision === 'int8' ? 'fp16i8' : 'fp16';
       if (params.quantizedLayers)
         strategy += ` layer${params.quantizedLayers}`;
+      if (params.tokenChunkSize)
+        strategy += ` chunk${params.tokenChunkSize}`;
       break;
     case 'CUDA':
     case 'CUDA-Beta':
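Illustrative only, not part of the diff: a rough Python re-expression of the WebGPU branch above, just to show the strategy strings the frontend now emits. It assumes the strategy starts empty for this branch; the function name and example values are hypothetical.

# Hypothetical sketch mirroring the WebGPU branch of getStrategy after this commit.
def webgpu_strategy(precision: str, quantized_layers: int = 0, token_chunk_size: int = 0) -> str:
    strategy = "fp16i4" if precision == "nf4" else ("fp16i8" if precision == "int8" else "fp16")
    if quantized_layers:
        strategy += f" layer{quantized_layers}"
    if token_chunk_size:
        strategy += f" chunk{token_chunk_size}"
    return strategy

print(webgpu_strategy("int8", quantized_layers=24, token_chunk_size=64))  # fp16i8 layer24 chunk64
print(webgpu_strategy("fp16", token_chunk_size=128))                      # fp16 chunk128

Strings like these are then consumed by the backend parsing shown in the first hunk, where the "chunkN" token becomes token_chunk_size.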