allow setting quantizedLayers of WebGPU mode; chore
parent c9513822c9
commit 887ba06bd6

backend-python/rwkv_pip/webgpu/model.py (vendored): 13 changes
backend-python/rwkv_pip/webgpu/model.py:

@@ -19,12 +19,19 @@ class RWKV:
         self.version = str(self.info.version).lower()
         self.wrp = getattr(wrp, self.version)
 
+        layer = (
+            int(s.lstrip("layer"))
+            for s in strategy.split()
+            for s in s.split(",")
+            if s.startswith("layer")
+        )
+
         args = {
             "file": model_path,
             "turbo": True,
-            "quant": 31 if "i8" in strategy else 0,
-            "quant_nf4": 26 if "i4" in strategy else 0,
-            "token_chunk_size": 32,
+            "quant": next(layer, 31) if "i8" in strategy else 0,
+            "quant_nf4": next(layer, 26) if "i4" in strategy else 0,
+            "token_chunk_size": 128,
             "lora": None,
         }
         self.model = self.wrp.Model(**args)
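For reference, a minimal standalone sketch of the parsing added above. The generator and the 31/26 defaults are copied from the hunk; the parse_quant wrapper and the strategy strings are illustrative:

    # Sketch: how "layerN" tokens in a strategy string select the layer count.
    def parse_quant(strategy: str) -> dict:
        # Lazily yield N from every "layerN" token; tokens may be
        # separated by spaces or commas.
        layer = (
            int(s.lstrip("layer"))
            for s in strategy.split()
            for s in s.split(",")
            if s.startswith("layer")
        )
        return {
            "quant": next(layer, 31) if "i8" in strategy else 0,
            "quant_nf4": next(layer, 26) if "i4" in strategy else 0,
        }

    assert parse_quant("fp16i8 layer24") == {"quant": 24, "quant_nf4": 0}
    assert parse_quant("fp16i4 layer20") == {"quant": 0, "quant_nf4": 20}
    assert parse_quant("fp16i8") == {"quant": 31, "quant_nf4": 0}  # old default kept
    assert parse_quant("fp16") == {"quant": 0, "quant_nf4": 0}     # no quantization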
Japanese locale:

@@ -341,5 +341,7 @@
   "Load Conversation": "会話を読み込む",
   "The latest X messages will be sent to the server. If you are using the RWKV-Runner server, please use the default value because RWKV-Runner has built-in state cache management which only calculates increments. Sending all messages will have lower cost. If you are using ChatGPT, adjust this value according to your needs to reduce ChatGPT expenses.": "最新のX件のメッセージがサーバーに送信されます。RWKV-Runnerサーバーを使用している場合は、デフォルト値を使用してください。RWKV-Runnerには組み込みの状態キャッシュ管理があり、増分のみを計算します。すべてのメッセージを送信すると、コストが低くなります。ChatGPTを使用している場合は、ChatGPTの費用を削減するために必要に応じてこの値を調整してください。",
   "History Message Number": "履歴メッセージ数",
-  "Send All Message": "すべてのメッセージを送信"
+  "Send All Message": "すべてのメッセージを送信",
+  "Quantized Layers": "量子化されたレイヤー",
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "現在の精度で量子化されたニューラルネットワークのレイヤーの数、量子化するほどVRAMの使用量が低くなりますが、品質も相応に低下します。"
 }
Simplified Chinese locale:

@@ -341,5 +341,7 @@
   "Load Conversation": "读取对话",
   "The latest X messages will be sent to the server. If you are using the RWKV-Runner server, please use the default value because RWKV-Runner has built-in state cache management which only calculates increments. Sending all messages will have lower cost. If you are using ChatGPT, adjust this value according to your needs to reduce ChatGPT expenses.": "最近的X条消息会发送至服务器. 如果你正在使用RWKV-Runner服务器, 请使用默认值, 因为RWKV-Runner内置了state缓存管理, 只计算增量, 发送所有消息将具有更低的成本. 如果你正在使用ChatGPT, 则根据你的需要调整此值, 这可以降低ChatGPT的费用",
   "History Message Number": "历史消息数量",
-  "Send All Message": "发送所有消息"
+  "Send All Message": "发送所有消息",
+  "Quantized Layers": "量化层数",
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "神经网络以当前精度量化的层数, 量化越多, 占用显存越低, 但质量相应下降"
 }
NumberInput component:

@@ -10,9 +10,10 @@ export const NumberInput: FC<{
   onChange?: (ev: React.ChangeEvent<HTMLInputElement>, data: SliderOnChangeData) => void
   style?: CSSProperties,
   toFixed?: number
-}> = ({ value, min, max, step, onChange, style, toFixed = 2 }) => {
+  disabled?: boolean
+}> = ({ value, min, max, step, onChange, style, toFixed = 2, disabled }) => {
   return (
-    <Input type="number" style={style} value={value.toString()} min={min} max={max} step={step}
+    <Input type="number" style={style} value={value.toString()} min={min} max={max} step={step} disabled={disabled}
       onChange={(e, data) => {
         onChange?.(e, { value: Number(data.value) });
       }}
ValuedSlider component:

@@ -11,7 +11,8 @@ export const ValuedSlider: FC<{
   input?: boolean
   onChange?: (ev: React.ChangeEvent<HTMLInputElement>, data: SliderOnChangeData) => void,
   toFixed?: number
-}> = ({ value, min, max, step, input, onChange, toFixed }) => {
+  disabled?: boolean
+}> = ({ value, min, max, step, input, onChange, toFixed, disabled }) => {
   const sliderRef = useRef<HTMLInputElement>(null);
   useEffect(() => {
     if (step && sliderRef.current && sliderRef.current.parentElement) {
@@ -24,10 +25,10 @@
     <div className="flex items-center">
       <Slider ref={sliderRef} className="grow" style={{ minWidth: '50%' }} value={value} min={min}
         max={max} step={step}
-        onChange={onChange} />
+        onChange={onChange} disabled={disabled} />
       {input
         ? <NumberInput style={{ minWidth: 0 }} value={value} min={min} max={max} step={step} onChange={onChange}
-          toFixed={toFixed} />
+          toFixed={toFixed} disabled={disabled} />
         : <Text>{value}</Text>}
     </div>
   );
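With disabled now threaded through both NumberInput and ValuedSlider, the Configs page below can grey the new slider out whenever the selected precision supports no quantization (neither int8 nor nf4).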
Configs page:

@@ -331,6 +331,26 @@ const Configs: FC = observer(() => {
                         }} />
                     } />
                 }
+                {selectedConfig.modelParameters.device.startsWith('WebGPU') && <div />}
+                {
+                  selectedConfig.modelParameters.device.startsWith('WebGPU') &&
+                  <Labeled label={t('Quantized Layers')}
+                    desc={t('Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.')}
+                    content={
+                      <ValuedSlider
+                        disabled={selectedConfig.modelParameters.precision !== 'int8' && selectedConfig.modelParameters.precision !== 'nf4'}
+                        value={selectedConfig.modelParameters.precision === 'int8' ? (selectedConfig.modelParameters.quantizedLayers || 31) :
+                          selectedConfig.modelParameters.precision === 'nf4' ? (selectedConfig.modelParameters.quantizedLayers || 26) :
+                            selectedConfig.modelParameters.maxStoredLayers
+                        } min={0}
+                        max={selectedConfig.modelParameters.maxStoredLayers} step={1} input
+                        onChange={(e, data) => {
+                          setSelectedConfigModelParams({
+                            quantizedLayers: data.value
+                          });
+                        }} />
+                    } />
+                }
                 {selectedConfig.modelParameters.device.startsWith('CUDA') && <div />}
                 {
                   displayStrategyImg &&
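Note that the slider's fallback values (31 for int8, 26 for nf4) match the backend defaults in model.py above, so a config saved without an explicit quantizedLayers keeps the previous behavior.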
ModelParameters type:

@@ -16,6 +16,7 @@ export type ModelParameters = {
   precision: Precision;
   storedLayers: number;
   maxStoredLayers: number;
+  quantizedLayers?: number;
   useCustomCuda?: boolean;
   customStrategy?: string;
   useCustomTokenizer?: boolean;
getStrategy:

@@ -194,6 +194,8 @@ export const getStrategy = (modelConfig: ModelConfig | undefined = undefined) => {
     case 'WebGPU':
     case 'WebGPU (Python)':
       strategy += params.precision === 'nf4' ? 'fp16i4' : params.precision === 'int8' ? 'fp16i8' : 'fp16';
+      if (params.quantizedLayers)
+        strategy += ` layer${params.quantizedLayers}`;
       break;
     case 'CUDA':
     case 'CUDA-Beta':
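Putting both ends together, a minimal sketch (in Python, covering the WebGPU branch only; the real getStrategy appends to a larger strategy string) of what the frontend now emits and what model.py derives from it:

    from typing import Optional

    # Sketch: mirror of the WebGPU branch of getStrategy above.
    def get_strategy(precision: str, quantized_layers: Optional[int]) -> str:
        strategy = {"nf4": "fp16i4", "int8": "fp16i8"}.get(precision, "fp16")
        if quantized_layers:  # omitted when unset, as in the TypeScript
            strategy += f" layer{quantized_layers}"
        return strategy

    assert get_strategy("int8", 24) == "fp16i8 layer24"  # model.py -> quant=24
    assert get_strategy("nf4", 20) == "fp16i4 layer20"   # model.py -> quant_nf4=20
    assert get_strategy("int8", None) == "fp16i8"        # model.py falls back to 31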