allow setting quantizedLayers of WebGPU mode; chore
commit 887ba06bd6 (parent c9513822c9)
backend-python/rwkv_pip/webgpu/model.py (vendored, 13 changes)
@@ -19,12 +19,19 @@ class RWKV:
         self.version = str(self.info.version).lower()
         self.wrp = getattr(wrp, self.version)
 
+        layer = (
+            int(s.lstrip("layer"))
+            for s in strategy.split()
+            for s in s.split(",")
+            if s.startswith("layer")
+        )
+
         args = {
             "file": model_path,
             "turbo": True,
-            "quant": 31 if "i8" in strategy else 0,
-            "quant_nf4": 26 if "i4" in strategy else 0,
-            "token_chunk_size": 32,
+            "quant": next(layer, 31) if "i8" in strategy else 0,
+            "quant_nf4": next(layer, 26) if "i4" in strategy else 0,
+            "token_chunk_size": 128,
             "lora": None,
         }
         self.model = self.wrp.Model(**args)
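
The generator above pulls the number out of any "layerN" token in the strategy string, and next(layer, ...) supplies the old hard-coded values (31 for i8, 26 for nf4) as fallbacks when no such token exists. A minimal self-contained sketch of the same parsing; parse_quant is a hypothetical helper name, not part of the commit:

def parse_quant(strategy: str) -> dict:
    # Yield the numeric part of any "layerN" token, e.g. "fp16i8 layer24" -> 24.
    # Note: lstrip("layer") strips the characters l/a/y/e/r, not the literal
    # prefix; it works here only because digits follow immediately.
    layer = (
        int(s.lstrip("layer"))
        for s in strategy.split()
        for s in s.split(",")
        if s.startswith("layer")
    )
    return {
        "quant": next(layer, 31) if "i8" in strategy else 0,
        "quant_nf4": next(layer, 26) if "i4" in strategy else 0,
    }

print(parse_quant("fp16i8 layer24"))  # {'quant': 24, 'quant_nf4': 0}
print(parse_quant("fp16i8"))          # {'quant': 31, 'quant_nf4': 0}
print(parse_quant("fp16i4"))          # {'quant': 0, 'quant_nf4': 26}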

Japanese locale file:

@@ -341,5 +341,7 @@
   "Load Conversation": "会話を読み込む",
   "The latest X messages will be sent to the server. If you are using the RWKV-Runner server, please use the default value because RWKV-Runner has built-in state cache management which only calculates increments. Sending all messages will have lower cost. If you are using ChatGPT, adjust this value according to your needs to reduce ChatGPT expenses.": "最新のX件のメッセージがサーバーに送信されます。RWKV-Runnerサーバーを使用している場合は、デフォルト値を使用してください。RWKV-Runnerには組み込みの状態キャッシュ管理があり、増分のみを計算します。すべてのメッセージを送信すると、コストが低くなります。ChatGPTを使用している場合は、ChatGPTの費用を削減するために必要に応じてこの値を調整してください。",
   "History Message Number": "履歴メッセージ数",
-  "Send All Message": "すべてのメッセージを送信"
+  "Send All Message": "すべてのメッセージを送信",
+  "Quantized Layers": "量子化されたレイヤー",
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "現在の精度で量子化されたニューラルネットワークのレイヤーの数、量子化するほどVRAMの使用量が低くなりますが、品質も相応に低下します。"
 }

Chinese locale file:

@@ -341,5 +341,7 @@
   "Load Conversation": "读取对话",
   "The latest X messages will be sent to the server. If you are using the RWKV-Runner server, please use the default value because RWKV-Runner has built-in state cache management which only calculates increments. Sending all messages will have lower cost. If you are using ChatGPT, adjust this value according to your needs to reduce ChatGPT expenses.": "最近的X条消息会发送至服务器. 如果你正在使用RWKV-Runner服务器, 请使用默认值, 因为RWKV-Runner内置了state缓存管理, 只计算增量, 发送所有消息将具有更低的成本. 如果你正在使用ChatGPT, 则根据你的需要调整此值, 这可以降低ChatGPT的费用",
   "History Message Number": "历史消息数量",
-  "Send All Message": "发送所有消息"
+  "Send All Message": "发送所有消息",
+  "Quantized Layers": "量化层数",
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "神经网络以当前精度量化的层数, 量化越多, 占用显存越低, 但质量相应下降"
 }

@@ -10,9 +10,10 @@ export const NumberInput: FC<{
   onChange?: (ev: React.ChangeEvent<HTMLInputElement>, data: SliderOnChangeData) => void
   style?: CSSProperties,
   toFixed?: number
-}> = ({ value, min, max, step, onChange, style, toFixed = 2 }) => {
+  disabled?: boolean
+}> = ({ value, min, max, step, onChange, style, toFixed = 2, disabled }) => {
   return (
-    <Input type="number" style={style} value={value.toString()} min={min} max={max} step={step}
+    <Input type="number" style={style} value={value.toString()} min={min} max={max} step={step} disabled={disabled}
       onChange={(e, data) => {
         onChange?.(e, { value: Number(data.value) });
       }}

@@ -11,7 +11,8 @@ export const ValuedSlider: FC<{
   input?: boolean
   onChange?: (ev: React.ChangeEvent<HTMLInputElement>, data: SliderOnChangeData) => void,
   toFixed?: number
-}> = ({ value, min, max, step, input, onChange, toFixed }) => {
+  disabled?: boolean
+}> = ({ value, min, max, step, input, onChange, toFixed, disabled }) => {
   const sliderRef = useRef<HTMLInputElement>(null);
   useEffect(() => {
     if (step && sliderRef.current && sliderRef.current.parentElement) {

@@ -24,10 +25,10 @@
     <div className="flex items-center">
       <Slider ref={sliderRef} className="grow" style={{ minWidth: '50%' }} value={value} min={min}
               max={max} step={step}
-              onChange={onChange} />
+              onChange={onChange} disabled={disabled} />
       {input
         ? <NumberInput style={{ minWidth: 0 }} value={value} min={min} max={max} step={step} onChange={onChange}
-                       toFixed={toFixed} />
+                       toFixed={toFixed} disabled={disabled} />
         : <Text>{value}</Text>}
     </div>
   );

@@ -331,6 +331,26 @@ const Configs: FC = observer(() => {
                 }} />
             } />
         }
+        {selectedConfig.modelParameters.device.startsWith('WebGPU') && <div />}
+        {
+          selectedConfig.modelParameters.device.startsWith('WebGPU') &&
+          <Labeled label={t('Quantized Layers')}
+            desc={t('Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.')}
+            content={
+              <ValuedSlider
+                disabled={selectedConfig.modelParameters.precision !== 'int8' && selectedConfig.modelParameters.precision !== 'nf4'}
+                value={selectedConfig.modelParameters.precision === 'int8' ? (selectedConfig.modelParameters.quantizedLayers || 31) :
+                  selectedConfig.modelParameters.precision === 'nf4' ? (selectedConfig.modelParameters.quantizedLayers || 26) :
+                    selectedConfig.modelParameters.maxStoredLayers
+                } min={0}
+                max={selectedConfig.modelParameters.maxStoredLayers} step={1} input
+                onChange={(e, data) => {
+                  setSelectedConfigModelParams({
+                    quantizedLayers: data.value
+                  });
+                }} />
+            } />
+        }
         {selectedConfig.modelParameters.device.startsWith('CUDA') && <div />}
         {
           displayStrategyImg &&
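
Note the fallback defaults in the slider's value expression: 31 for int8 and 26 for nf4 are the same fallbacks the backend applies via next(layer, 31) and next(layer, 26) in model.py, so an untouched slider displays the value the backend would use anyway. A sketch of that selection in Python; displayed_layers is an illustrative name, not code from the commit:

def displayed_layers(precision: str, quantized_layers: int | None, max_stored_layers: int) -> int:
    # Mirrors the value= expression above; the slider is disabled for other precisions.
    if precision == "int8":
        return quantized_layers or 31   # same fallback as next(layer, 31)
    if precision == "nf4":
        return quantized_layers or 26   # same fallback as next(layer, 26)
    return max_stored_layers

assert displayed_layers("int8", None, 41) == 31
assert displayed_layers("nf4", 12, 41) == 12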

@@ -16,6 +16,7 @@ export type ModelParameters = {
   precision: Precision;
   storedLayers: number;
   maxStoredLayers: number;
+  quantizedLayers?: number;
   useCustomCuda?: boolean;
   customStrategy?: string;
   useCustomTokenizer?: boolean;

@@ -194,6 +194,8 @@ export const getStrategy = (modelConfig: ModelConfig | undefined = undefined) =>
     case 'WebGPU':
     case 'WebGPU (Python)':
       strategy += params.precision === 'nf4' ? 'fp16i4' : params.precision === 'int8' ? 'fp16i8' : 'fp16';
+      if (params.quantizedLayers)
+        strategy += ` layer${params.quantizedLayers}`;
       break;
     case 'CUDA':
     case 'CUDA-Beta':
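
Round trip: with these two lines, a WebGPU config with int8 precision and quantizedLayers set to 24 produces the strategy string "fp16i8 layer24", which the new generator in model.py parses back into quant = 24. A hypothetical Python mirror of this TypeScript branch, for illustration only:

def webgpu_strategy(precision: str, quantized_layers: int | None) -> str:
    # Mirrors the TypeScript above: precision picks the base strategy,
    # and a truthy quantizedLayers appends a "layerN" token.
    strategy = {"nf4": "fp16i4", "int8": "fp16i8"}.get(precision, "fp16")
    if quantized_layers:
        strategy += f" layer{quantized_layers}"
    return strategy

assert webgpu_strategy("int8", 24) == "fp16i8 layer24"  # parsed back as quant=24
assert webgpu_strategy("nf4", None) == "fp16i4"         # backend falls back to 26
assert webgpu_strategy("fp16", None) == "fp16"          # no quantization requested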