diff --git a/backend-python/rwkv_pip/webgpu/model.py b/backend-python/rwkv_pip/webgpu/model.py
index 51a3972..0c59528 100644
--- a/backend-python/rwkv_pip/webgpu/model.py
+++ b/backend-python/rwkv_pip/webgpu/model.py
@@ -19,12 +19,19 @@ class RWKV:
         self.version = str(self.info.version).lower()
         self.wrp = getattr(wrp, self.version)
 
+        layer = (
+            int(s.lstrip("layer"))
+            for s in strategy.split()
+            for s in s.split(",")
+            if s.startswith("layer")
+        )
+
         args = {
             "file": model_path,
             "turbo": True,
-            "quant": 31 if "i8" in strategy else 0,
-            "quant_nf4": 26 if "i4" in strategy else 0,
-            "token_chunk_size": 32,
+            "quant": next(layer, 31) if "i8" in strategy else 0,
+            "quant_nf4": next(layer, 26) if "i4" in strategy else 0,
+            "token_chunk_size": 128,
             "lora": None,
         }
         self.model = self.wrp.Model(**args)
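For clarity: the new generator scans whitespace- and comma-separated strategy tokens for one shaped like `layerN`, and `next(layer, default)` falls back to the previous hard-coded counts (31 for int8, 26 for nf4) when no such token is present. A minimal, self-contained sketch of this behavior; the `parse_quant` helper and the sample strings are illustrative, not part of the diff:

    # Illustrative helper, not part of the diff: same parsing as model.py above.
    def parse_quant(strategy: str) -> tuple[int, int]:
        layer = (
            int(s.lstrip("layer"))
            for s in strategy.split()
            for s in s.split(",")
            if s.startswith("layer")
        )
        quant = next(layer, 31) if "i8" in strategy else 0      # int8 layer count
        quant_nf4 = next(layer, 26) if "i4" in strategy else 0  # nf4 layer count
        return quant, quant_nf4

    assert parse_quant("fp16") == (0, 0)             # no quantization at all
    assert parse_quant("fp16i8") == (31, 0)          # old default kept as fallback
    assert parse_quant("fp16i8 layer24") == (24, 0)  # user-chosen layer count
    assert parse_quant("fp16i4 layer20") == (0, 20)

One caveat worth noting: `lstrip("layer")` strips a character set rather than a prefix, which is safe here only because the digits that follow never contain the letters l, a, y, e, r.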
diff --git a/frontend/src/_locales/ja/main.json b/frontend/src/_locales/ja/main.json
index f57494d..dc70b63 100644
--- a/frontend/src/_locales/ja/main.json
+++ b/frontend/src/_locales/ja/main.json
@@ -341,5 +341,7 @@
   "Load Conversation": "会話を読み込む",
   "The latest X messages will be sent to the server. If you are using the RWKV-Runner server, please use the default value because RWKV-Runner has built-in state cache management which only calculates increments. Sending all messages will have lower cost. If you are using ChatGPT, adjust this value according to your needs to reduce ChatGPT expenses.": "最新のX件のメッセージがサーバーに送信されます。RWKV-Runnerサーバーを使用している場合は、デフォルト値を使用してください。RWKV-Runnerには組み込みの状態キャッシュ管理があり、増分のみを計算します。すべてのメッセージを送信すると、コストが低くなります。ChatGPTを使用している場合は、ChatGPTの費用を削減するために必要に応じてこの値を調整してください。",
   "History Message Number": "履歴メッセージ数",
-  "Send All Message": "すべてのメッセージを送信"
+  "Send All Message": "すべてのメッセージを送信",
+  "Quantized Layers": "量子化されたレイヤー",
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "現在の精度で量子化されたニューラルネットワークのレイヤーの数、量子化するほどVRAMの使用量が低くなりますが、品質も相応に低下します。"
 }
\ No newline at end of file
diff --git a/frontend/src/_locales/zh-hans/main.json b/frontend/src/_locales/zh-hans/main.json
index 0008e2a..93afe52 100644
--- a/frontend/src/_locales/zh-hans/main.json
+++ b/frontend/src/_locales/zh-hans/main.json
@@ -341,5 +341,7 @@
   "Load Conversation": "读取对话",
   "The latest X messages will be sent to the server. If you are using the RWKV-Runner server, please use the default value because RWKV-Runner has built-in state cache management which only calculates increments. Sending all messages will have lower cost. If you are using ChatGPT, adjust this value according to your needs to reduce ChatGPT expenses.": "最近的X条消息会发送至服务器. 如果你正在使用RWKV-Runner服务器, 请使用默认值, 因为RWKV-Runner内置了state缓存管理, 只计算增量, 发送所有消息将具有更低的成本. 如果你正在使用ChatGPT, 则根据你的需要调整此值, 这可以降低ChatGPT的费用",
   "History Message Number": "历史消息数量",
-  "Send All Message": "发送所有消息"
+  "Send All Message": "发送所有消息",
+  "Quantized Layers": "量化层数",
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "神经网络以当前精度量化的层数, 量化越多, 占用显存越低, 但质量相应下降"
 }
\ No newline at end of file
diff --git a/frontend/src/components/NumberInput.tsx b/frontend/src/components/NumberInput.tsx
index 9f38972..dafdaa1 100644
--- a/frontend/src/components/NumberInput.tsx
+++ b/frontend/src/components/NumberInput.tsx
@@ -10,9 +10,10 @@ export const NumberInput: FC<{
   onChange?: (ev: React.ChangeEvent<HTMLInputElement>, data: SliderOnChangeData) => void
   style?: CSSProperties,
   toFixed?: number
-}> = ({ value, min, max, step, onChange, style, toFixed = 2 }) => {
+  disabled?: boolean
+}> = ({ value, min, max, step, onChange, style, toFixed = 2, disabled }) => {
   return (
-    <Input type="number" style={style} value={value.toString()} min={min} max={max} step={step}
+    <Input type="number" style={style} value={value.toString()} min={min} max={max} step={step} disabled={disabled}
       onChange={(e, data) => { onChange?.(e, { value: Number(data.value) }); }}
diff --git a/frontend/src/components/ValuedSlider.tsx b/frontend/src/components/ValuedSlider.tsx
index 87314a5..a60cb2e 100644
--- a/frontend/src/components/ValuedSlider.tsx
+++ b/frontend/src/components/ValuedSlider.tsx
@@ -11,7 +11,8 @@ export const ValuedSlider: FC<{
   input?: boolean
   onChange?: (ev: React.ChangeEvent<HTMLInputElement>, data: SliderOnChangeData) => void,
   toFixed?: number
-}> = ({ value, min, max, step, input, onChange, toFixed }) => {
+  disabled?: boolean
+}> = ({ value, min, max, step, input, onChange, toFixed, disabled }) => {
   const sliderRef = useRef<HTMLDivElement>(null);
   useEffect(() => {
     if (step && sliderRef.current && sliderRef.current.parentElement) {
@@ -24,10 +25,10 @@ export const ValuedSlider: FC<{
     <div className="flex items-center">
       <Slider ref={sliderRef} className="grow" style={{ minWidth: '50%' }} value={value} min={min} max={max}
         step={step}
-        onChange={onChange} />
+        onChange={onChange} disabled={disabled} />
       {input
         ? <NumberInput style={{ minWidth: 0 }} value={value} min={min} max={max} step={step} onChange={onChange}
-          toFixed={toFixed} />
+          toFixed={toFixed} disabled={disabled} />
         : <Text>{value}</Text>}
     </div>
   );
diff --git a/frontend/src/pages/Configs.tsx b/frontend/src/pages/Configs.tsx
index 19a7941..6a503df 100644
--- a/frontend/src/pages/Configs.tsx
+++ b/frontend/src/pages/Configs.tsx
@@ -331,6 +331,26 @@ const Configs: FC = observer(() => {
                     }} />
                 } />
              }
+              {selectedConfig.modelParameters.device.startsWith('WebGPU') && <div />}
+              {
+                selectedConfig.modelParameters.device.startsWith('WebGPU') &&
+                <Labeled
+                  label={t('Quantized Layers')}
+                  desc={t('Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.')}
+                  content={
+                    <ValuedSlider
+                      disabled={selectedConfig.modelParameters.precision === 'fp16'}
+                      value={selectedConfig.modelParameters.quantizedLayers || 31}
+                      min={0} max={64}
+                      step={1}
+                      input
+                      onChange={(e, data) => {
+                        setSelectedConfigModelParams({
+                          quantizedLayers: data.value
+                        });
+                      }} />
+                  } />
+              }
               {selectedConfig.modelParameters.device.startsWith('CUDA') && <div />}
               {
                 displayStrategyImg &&
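For orientation, the new slider persists into the model config as a plain optional field (see the types/configs.ts hunk below). A hedged sketch of the parameter surface this screen writes, with illustrative values only:

    # Illustrative only: the ModelParameters fields this PR touches,
    # written as a Python dict for brevity.
    webgpu_params = {
        "device": "WebGPU (Python)",
        "precision": "int8",    # or 'nf4' / 'fp16'
        "quantizedLayers": 24,  # optional; when unset, backend defaults apply (31/26)
    }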
diff --git a/frontend/src/types/configs.ts b/frontend/src/types/configs.ts
index 4ee5f97..2b2015c 100644
--- a/frontend/src/types/configs.ts
+++ b/frontend/src/types/configs.ts
@@ -16,6 +16,7 @@ export type ModelParameters = {
   precision: Precision;
   storedLayers: number;
   maxStoredLayers: number;
+  quantizedLayers?: number;
   useCustomCuda?: boolean;
   customStrategy?: string;
   useCustomTokenizer?: boolean;
diff --git a/frontend/src/utils/index.tsx b/frontend/src/utils/index.tsx
index 5ddc70c..1575150 100644
--- a/frontend/src/utils/index.tsx
+++ b/frontend/src/utils/index.tsx
@@ -194,6 +194,8 @@ export const getStrategy = (modelConfig: ModelConfig | undefined = undefined) =>
     case 'WebGPU':
     case 'WebGPU (Python)':
       strategy += params.precision === 'nf4' ? 'fp16i4' : params.precision === 'int8' ? 'fp16i8' : 'fp16';
+      if (params.quantizedLayers)
+        strategy += ` layer${params.quantizedLayers}`;
       break;
     case 'CUDA':
     case 'CUDA-Beta':
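End to end, the WebGPU branch of getStrategy now emits strings like `fp16i8 layer24`, which the backend generator in model.py consumes. A minimal sketch of that branch; `build_strategy` is a hypothetical Python mirror written for illustration, not code from this PR:

    # Hypothetical mirror of getStrategy's WebGPU branch, for illustration.
    def build_strategy(precision: str, quantized_layers: int = 0) -> str:
        base = {"nf4": "fp16i4", "int8": "fp16i8"}.get(precision, "fp16")
        if quantized_layers:
            base += f" layer{quantized_layers}"
        return base

    assert build_strategy("int8", 24) == "fp16i8 layer24"  # model.py reads quant=24
    assert build_strategy("nf4") == "fp16i4"               # backend falls back to quant_nf4=26
    assert build_strategy("fp16", 24) == "fp16 layer24"    # layer token appended but unused (no i8/i4)

Combined with the `parse_quant` sketch earlier, `parse_quant(build_strategy("int8", 24)) == (24, 0)`, closing the loop from the UI slider to the `wrp.Model` arguments.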