diff --git a/backend-python/rwkv_pip/webgpu/model.py b/backend-python/rwkv_pip/webgpu/model.py
index 51a3972..0c59528 100644
--- a/backend-python/rwkv_pip/webgpu/model.py
+++ b/backend-python/rwkv_pip/webgpu/model.py
@@ -19,12 +19,19 @@ class RWKV:
         self.version = str(self.info.version).lower()
         self.wrp = getattr(wrp, self.version)
 
+        layer = (
+            int(s.lstrip("layer"))
+            for s in strategy.split()
+            for s in s.split(",")
+            if s.startswith("layer")
+        )
+
         args = {
             "file": model_path,
             "turbo": True,
-            "quant": 31 if "i8" in strategy else 0,
-            "quant_nf4": 26 if "i4" in strategy else 0,
-            "token_chunk_size": 32,
+            "quant": next(layer, 31) if "i8" in strategy else 0,
+            "quant_nf4": next(layer, 26) if "i4" in strategy else 0,
+            "token_chunk_size": 128,
             "lora": None,
         }
         self.model = self.wrp.Model(**args)
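For clarity: the new generator scans whitespace- and comma-separated strategy tokens for one shaped like `layerN`, and `next(layer, default)` falls back to the previous hard-coded counts (31 for int8, 26 for nf4) when no such token is present. A minimal, self-contained sketch of this behavior; the `parse_quant` helper and the sample strings are illustrative, not part of the diff:

    # Illustrative helper, not part of the diff: same parsing as model.py above.
    def parse_quant(strategy: str) -> tuple[int, int]:
        layer = (
            int(s.lstrip("layer"))
            for s in strategy.split()
            for s in s.split(",")
            if s.startswith("layer")
        )
        quant = next(layer, 31) if "i8" in strategy else 0      # int8 layer count
        quant_nf4 = next(layer, 26) if "i4" in strategy else 0  # nf4 layer count
        return quant, quant_nf4

    assert parse_quant("fp16") == (0, 0)             # no quantization at all
    assert parse_quant("fp16i8") == (31, 0)          # old default kept as fallback
    assert parse_quant("fp16i8 layer24") == (24, 0)  # user-chosen layer count
    assert parse_quant("fp16i4 layer20") == (0, 20)

One caveat worth noting: `lstrip("layer")` strips a character set rather than a prefix, which is safe here only because the digits that follow never contain the letters l, a, y, e, r.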
diff --git a/frontend/src/_locales/ja/main.json b/frontend/src/_locales/ja/main.json
index f57494d..dc70b63 100644
--- a/frontend/src/_locales/ja/main.json
+++ b/frontend/src/_locales/ja/main.json
@@ -341,5 +341,7 @@
   "Load Conversation": "会話を読み込む",
   "The latest X messages will be sent to the server. If you are using the RWKV-Runner server, please use the default value because RWKV-Runner has built-in state cache management which only calculates increments. Sending all messages will have lower cost. If you are using ChatGPT, adjust this value according to your needs to reduce ChatGPT expenses.": "最新のX件のメッセージがサーバーに送信されます。RWKV-Runnerサーバーを使用している場合は、デフォルト値を使用してください。RWKV-Runnerには組み込みの状態キャッシュ管理があり、増分のみを計算します。すべてのメッセージを送信すると、コストが低くなります。ChatGPTを使用している場合は、ChatGPTの費用を削減するために必要に応じてこの値を調整してください。",
   "History Message Number": "履歴メッセージ数",
-  "Send All Message": "すべてのメッセージを送信"
+  "Send All Message": "すべてのメッセージを送信",
+  "Quantized Layers": "量子化されたレイヤー",
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "現在の精度で量子化されたニューラルネットワークのレイヤーの数、量子化するほどVRAMの使用量が低くなりますが、品質も相応に低下します。"
 }
\ No newline at end of file
diff --git a/frontend/src/_locales/zh-hans/main.json b/frontend/src/_locales/zh-hans/main.json
index 0008e2a..93afe52 100644
--- a/frontend/src/_locales/zh-hans/main.json
+++ b/frontend/src/_locales/zh-hans/main.json
@@ -341,5 +341,7 @@
   "Load Conversation": "读取对话",
   "The latest X messages will be sent to the server. If you are using the RWKV-Runner server, please use the default value because RWKV-Runner has built-in state cache management which only calculates increments. Sending all messages will have lower cost. If you are using ChatGPT, adjust this value according to your needs to reduce ChatGPT expenses.": "最近的X条消息会发送至服务器. 如果你正在使用RWKV-Runner服务器, 请使用默认值, 因为RWKV-Runner内置了state缓存管理, 只计算增量, 发送所有消息将具有更低的成本. 如果你正在使用ChatGPT, 则根据你的需要调整此值, 这可以降低ChatGPT的费用",
   "History Message Number": "历史消息数量",
-  "Send All Message": "发送所有消息"
+  "Send All Message": "发送所有消息",
+  "Quantized Layers": "量化层数",
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "神经网络以当前精度量化的层数, 量化越多, 占用显存越低, 但质量相应下降"
 }
\ No newline at end of file
diff --git a/frontend/src/components/NumberInput.tsx b/frontend/src/components/NumberInput.tsx
index 9f38972..dafdaa1 100644
--- a/frontend/src/components/NumberInput.tsx
+++ b/frontend/src/components/NumberInput.tsx
@@ -10,9 +10,10 @@ export const NumberInput: FC<{
   onChange?: (ev: React.ChangeEvent<HTMLInputElement>, data: SliderOnChangeData) => void
   style?: CSSProperties,
   toFixed?: number
-}> = ({ value, min, max, step, onChange, style, toFixed = 2 }) => {
+  disabled?: boolean
+}> = ({ value, min, max, step, onChange, style, toFixed = 2, disabled }) => {
   return (
-    <Input type="number" style={style} value={value.toString()} min={min} max={max} step={step}
+    <Input type="number" style={style} value={value.toString()} min={min} max={max} step={step} disabled={disabled}
       onChange={(e, data) => { onChange?.(e, { value: Number(data.value) }); }}
diff --git a/frontend/src/components/ValuedSlider.tsx b/frontend/src/components/ValuedSlider.tsx
index 87314a5..a60cb2e 100644
--- a/frontend/src/components/ValuedSlider.tsx
+++ b/frontend/src/components/ValuedSlider.tsx
@@ -11,7 +11,8 @@ export const ValuedSlider: FC<{
   input?: boolean
   onChange?: (ev: React.ChangeEvent<HTMLInputElement>, data: SliderOnChangeData) => void,
   toFixed?: number
-}> = ({ value, min, max, step, input, onChange, toFixed }) => {
+  disabled?: boolean
+}> = ({ value, min, max, step, input, onChange, toFixed, disabled }) => {
   const sliderRef = useRef<HTMLDivElement>(null);
   useEffect(() => {
     if (step && sliderRef.current && sliderRef.current.parentElement) {
@@ -24,10 +25,10 @@ export const ValuedSlider: FC<{
     <div className="flex items-center">
       <Slider ref={sliderRef} className="grow" style={{ minWidth: '50%' }} value={value} min={min} max={max}
         step={step}
-        onChange={onChange} />
+        onChange={onChange} disabled={disabled} />
       {input
         ? <NumberInput style={{ minWidth: 0 }} value={value} min={min} max={max} step={step} onChange={onChange}
-          toFixed={toFixed} />
+          toFixed={toFixed} disabled={disabled} />
         : <Text>{value}</Text>}
     </div>
   );
diff --git a/frontend/src/pages/Configs.tsx b/frontend/src/pages/Configs.tsx
index 19a7941..6a503df 100644
--- a/frontend/src/pages/Configs.tsx
+++ b/frontend/src/pages/Configs.tsx
@@ -331,6 +331,26 @@ const Configs: FC = observer(() => {
                     }} />
                 } />
              }
+              {selectedConfig.modelParameters.device.startsWith('WebGPU') && <div />}
+              {
+                selectedConfig.modelParameters.device.startsWith('WebGPU') &&
+                <Labeled
+                  label={t('Quantized Layers')}
+                  desc={t('Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.')}
+                  content={
+                    <ValuedSlider
+                      disabled={selectedConfig.modelParameters.precision === 'fp16'}
+                      value={selectedConfig.modelParameters.quantizedLayers || 31}
+                      min={0} max={64}
+                      step={1}
+                      input
+                      onChange={(e, data) => {
+                        setSelectedConfigModelParams({
+                          quantizedLayers: data.value
+                        });
+                      }} />
+                  } />
+              }
               {selectedConfig.modelParameters.device.startsWith('CUDA') && <div />}
               {
                 displayStrategyImg &&
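For orientation, the new slider persists into the model config as a plain optional field (see the types/configs.ts hunk below). A hedged sketch of the parameter surface this screen writes, with illustrative values only:

    # Illustrative only: the ModelParameters fields this PR touches,
    # written as a Python dict for brevity.
    webgpu_params = {
        "device": "WebGPU (Python)",
        "precision": "int8",    # or 'nf4' / 'fp16'
        "quantizedLayers": 24,  # optional; when unset, backend defaults apply (31/26)
    }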
diff --git a/frontend/src/types/configs.ts b/frontend/src/types/configs.ts
index 4ee5f97..2b2015c 100644
--- a/frontend/src/types/configs.ts
+++ b/frontend/src/types/configs.ts
@@ -16,6 +16,7 @@ export type ModelParameters = {
   precision: Precision;
   storedLayers: number;
   maxStoredLayers: number;
+  quantizedLayers?: number;
   useCustomCuda?: boolean;
   customStrategy?: string;
   useCustomTokenizer?: boolean;
diff --git a/frontend/src/utils/index.tsx b/frontend/src/utils/index.tsx
index 5ddc70c..1575150 100644
--- a/frontend/src/utils/index.tsx
+++ b/frontend/src/utils/index.tsx
@@ -194,6 +194,8 @@ export const getStrategy = (modelConfig: ModelConfig | undefined = undefined) =>
     case 'WebGPU':
     case 'WebGPU (Python)':
       strategy += params.precision === 'nf4' ? 'fp16i4' : params.precision === 'int8' ? 'fp16i8' : 'fp16';
+      if (params.quantizedLayers)
+        strategy += ` layer${params.quantizedLayers}`;
       break;
     case 'CUDA':
     case 'CUDA-Beta':
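End to end, the WebGPU branch of getStrategy now emits strings like `fp16i8 layer24`, which the backend generator in model.py consumes. A minimal sketch of that branch; `build_strategy` is a hypothetical Python mirror written for illustration, not code from this PR:

    # Hypothetical mirror of getStrategy's WebGPU branch, for illustration.
    def build_strategy(precision: str, quantized_layers: int = 0) -> str:
        base = {"nf4": "fp16i4", "int8": "fp16i8"}.get(precision, "fp16")
        if quantized_layers:
            base += f" layer{quantized_layers}"
        return base

    assert build_strategy("int8", 24) == "fp16i8 layer24"  # model.py reads quant=24
    assert build_strategy("nf4") == "fp16i4"               # backend falls back to quant_nf4=26
    assert build_strategy("fp16", 24) == "fp16 layer24"    # layer token appended but unused (no i8/i4)

Combined with the `parse_quant` sketch earlier, `parse_quant(build_strategy("int8", 24)) == (24, 0)`, closing the loop from the UI slider to the `wrp.Model` arguments.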