allow setting tokenChunkSize of WebGPU mode
@@ -343,5 +343,7 @@
   "History Message Number": "履歴メッセージ数",
   "Send All Message": "すべてのメッセージを送信",
   "Quantized Layers": "量子化されたレイヤー",
-  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "現在の精度で量子化されたニューラルネットワークのレイヤーの数、量子化するほどVRAMの使用量が低くなりますが、品質も相応に低下します。"
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "現在の精度で量子化されたニューラルネットワークのレイヤーの数、量子化するほどVRAMの使用量が低くなりますが、品質も相応に低下します。",
+  "Parallel Token Chunk Size": "並列トークンチャンクサイズ",
+  "Maximum tokens to be processed in parallel at once. For high end GPUs, this could be 64 or 128 (faster).": "一度に並列で処理される最大トークン数。高性能なGPUの場合、64または128になります(高速)。"
 }
@@ -343,5 +343,7 @@
   "History Message Number": "历史消息数量",
   "Send All Message": "发送所有消息",
   "Quantized Layers": "量化层数",
-  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "神经网络以当前精度量化的层数, 量化越多, 占用显存越低, 但质量相应下降"
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "神经网络以当前精度量化的层数, 量化越多, 占用显存越低, 但质量相应下降",
+  "Parallel Token Chunk Size": "并行Token块大小",
+  "Maximum tokens to be processed in parallel at once. For high end GPUs, this could be 64 or 128 (faster).": "一次最多可以并行处理的token数量. 对于高端显卡, 这可以是64或128 (更快)"
 }
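Both locale files add the same two keys. The UI looks these strings up with t(), using the English source text as the key; below is a minimal, standalone sketch of that lookup using plain i18next. The resource layout here is an assumption for illustration, not the project's actual locale loading code.

```typescript
// Minimal sketch, assuming an i18next-style setup where the English source
// string doubles as the translation key. Resource layout is illustrative only.
import i18next from 'i18next';

i18next
  .init({
    lng: 'ja',
    resources: {
      ja: {
        translation: {
          'Parallel Token Chunk Size': '並列トークンチャンクサイズ',
        },
      },
    },
  })
  .then(() => {
    console.log(i18next.t('Parallel Token Chunk Size')); // "並列トークンチャンクサイズ"
    // A key with no translation falls back to the key itself, i.e. readable English.
    console.log(i18next.t('Some untranslated label'));   // "Some untranslated label"
  });
```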
@@ -331,7 +331,21 @@ const Configs: FC = observer(() => {
                 }} />
               } />
             }
-            {selectedConfig.modelParameters.device.startsWith('WebGPU') && <div />}
+            {
+              selectedConfig.modelParameters.device.startsWith('WebGPU') &&
+              <Labeled label={t('Parallel Token Chunk Size')}
+                desc={t('Maximum tokens to be processed in parallel at once. For high end GPUs, this could be 64 or 128 (faster).')}
+                content={
+                  <ValuedSlider
+                    value={selectedConfig.modelParameters.tokenChunkSize || 32}
+                    min={16} max={256} step={16} input
+                    onChange={(e, data) => {
+                      setSelectedConfigModelParams({
+                        tokenChunkSize: data.value
+                      });
+                    }} />
+                } />
+            }
             {
               selectedConfig.modelParameters.device.startsWith('WebGPU') &&
               <Labeled label={t('Quantized Layers')}
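The new slider writes back through setSelectedConfigModelParams({ tokenChunkSize: data.value }). That helper is not part of this hunk; the sketch below only illustrates the partial-merge pattern it implies. The store class, its field names, and its shape are assumptions, not the project's actual code.

```typescript
// Illustrative MobX-style store; ConfigsStore, configs and currentIndex are
// assumed names, not taken from this commit.
import { makeAutoObservable } from 'mobx';

type ModelParameters = {
  device: string;
  quantizedLayers?: number;
  tokenChunkSize?: number; // the field introduced by this commit
};

type ModelConfig = { name: string; modelParameters: ModelParameters };

class ConfigsStore {
  configs: ModelConfig[] = [{ name: 'default', modelParameters: { device: 'WebGPU' } }];
  currentIndex = 0;

  constructor() {
    makeAutoObservable(this);
  }

  // Mirrors the slider's onChange: merge a partial update into the selected config.
  setSelectedConfigModelParams(params: Partial<ModelParameters>) {
    const config = this.configs[this.currentIndex];
    config.modelParameters = { ...config.modelParameters, ...params };
  }
}

const store = new ConfigsStore();
store.setSelectedConfigModelParams({ tokenChunkSize: 64 });
```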
@@ -17,6 +17,7 @@ export type ModelParameters = {
   storedLayers: number;
   maxStoredLayers: number;
   quantizedLayers?: number;
+  tokenChunkSize?: number;
   useCustomCuda?: boolean;
   customStrategy?: string;
   useCustomTokenizer?: boolean;
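Because tokenChunkSize is optional, configs saved before this change still type-check and deserialize; the UI simply falls back to 32 when the field is absent. A small sketch of that fallback follows; the loader function is illustrative, not project code.

```typescript
// Hedged sketch: reading a stored parameter blob that may or may not contain
// the new field. The 32 fallback matches the default the slider displays.
type StoredParams = { quantizedLayers?: number; tokenChunkSize?: number };

const effectiveTokenChunkSize = (raw: string): number => {
  const parsed = JSON.parse(raw) as StoredParams;
  return parsed.tokenChunkSize ?? 32;
};

console.log(effectiveTokenChunkSize('{"quantizedLayers": 20}')); // 32 (old config)
console.log(effectiveTokenChunkSize('{"tokenChunkSize": 64}'));  // 64 (new config)
```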
@@ -196,6 +196,8 @@ export const getStrategy = (modelConfig: ModelConfig | undefined = undefined) =>
       strategy += params.precision === 'nf4' ? 'fp16i4' : params.precision === 'int8' ? 'fp16i8' : 'fp16';
       if (params.quantizedLayers)
         strategy += ` layer${params.quantizedLayers}`;
+      if (params.tokenChunkSize)
+        strategy += ` chunk${params.tokenChunkSize}`;
       break;
     case 'CUDA':
     case 'CUDA-Beta':
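With the new branch, a WebGPU strategy string gains a `chunk` suffix after the optional `layer` suffix. Below is a standalone sketch of just the suffix composition shown in this hunk; buildWebGpuSuffix is an illustrative helper, and the leading device token and the rest of getStrategy are not reproduced here.

```typescript
// Based only on the lines in this hunk; not a function that exists in the project.
type StrategyParams = {
  precision?: 'fp16' | 'int8' | 'nf4';
  quantizedLayers?: number;
  tokenChunkSize?: number;
};

const buildWebGpuSuffix = (params: StrategyParams): string => {
  let strategy = '';
  strategy += params.precision === 'nf4' ? 'fp16i4' : params.precision === 'int8' ? 'fp16i8' : 'fp16';
  if (params.quantizedLayers)
    strategy += ` layer${params.quantizedLayers}`;
  if (params.tokenChunkSize)
    strategy += ` chunk${params.tokenChunkSize}`;
  return strategy;
};

console.log(buildWebGpuSuffix({ precision: 'nf4', quantizedLayers: 20, tokenChunkSize: 64 }));
// -> "fp16i4 layer20 chunk64"
```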