allow setting tokenChunkSize in WebGPU mode
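Adds an optional tokenChunkSize parameter for the WebGPU device: the frontend exposes a "Parallel Token Chunk Size" slider (16-256, step 16), encodes the chosen value into the strategy string as a chunkN token, and the Python backend parses that token and forwards it to the model as token_chunk_size, falling back to 32 (previously hard-coded to 128) when absent. Japanese and Chinese locale strings for the new label and tooltip are included.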
parent c90cefc453
commit d91c3c004d
backend-python/rwkv_pip/webgpu/model.py (vendored) | 9 changes
backend-python/rwkv_pip/webgpu/model.py

@@ -26,12 +26,19 @@ class RWKV:
             if s.startswith("layer")
         )
 
+        chunk_size = (
+            int(s.lstrip("chunk"))
+            for s in strategy.split()
+            for s in s.split(",")
+            if s.startswith("chunk")
+        )
+
         args = {
             "file": model_path,
             "turbo": True,
             "quant": next(layer, 31) if "i8" in strategy else 0,
             "quant_nf4": next(layer, 26) if "i4" in strategy else 0,
-            "token_chunk_size": 128,
+            "token_chunk_size": next(chunk_size, 32),
             "lora": None,
         }
         self.model = self.wrp.Model(**args)
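For context, here is a minimal standalone sketch of the parsing the hunk above introduces. The helper name parse_token_chunk_size is hypothetical; the token format (a space- or comma-separated chunkN entry, mirroring the existing layerN parsing) and the fallback of 32 come from the diff itself.

```python
def parse_token_chunk_size(strategy: str, default: int = 32) -> int:
    # Split the strategy on whitespace and commas, keep tokens that
    # look like "chunk64", and take the numeric suffix of the first one.
    chunk_size = (
        int(s.lstrip("chunk"))      # lstrip removes the char set c/h/u/n/k,
        for s in strategy.split()   # which works here because the remainder
        for s in s.split(",")       # of a matching token is all digits
        if s.startswith("chunk")
    )
    # next() supplies the new default of 32 when no chunkN token exists
    # (the old code hard-coded 128 unconditionally).
    return next(chunk_size, default)

assert parse_token_chunk_size("fp16i8 layer24 chunk64") == 64
assert parse_token_chunk_size("fp16i8") == 32
```

Because chunk_size is a generator, nothing past the first matching token is ever evaluated, and the whole scan is skipped entirely when the value is never requested.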
@@ -343,5 +343,7 @@
   "History Message Number": "履歴メッセージ数",
   "Send All Message": "すべてのメッセージを送信",
   "Quantized Layers": "量子化されたレイヤー",
-  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "現在の精度で量子化されたニューラルネットワークのレイヤーの数、量子化するほどVRAMの使用量が低くなりますが、品質も相応に低下します。"
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "現在の精度で量子化されたニューラルネットワークのレイヤーの数、量子化するほどVRAMの使用量が低くなりますが、品質も相応に低下します。",
+  "Parallel Token Chunk Size": "並列トークンチャンクサイズ",
+  "Maximum tokens to be processed in parallel at once. For high end GPUs, this could be 64 or 128 (faster).": "一度に並列で処理される最大トークン数。高性能なGPUの場合、64または128になります(高速)。"
 }
@@ -343,5 +343,7 @@
   "History Message Number": "历史消息数量",
   "Send All Message": "发送所有消息",
   "Quantized Layers": "量化层数",
-  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "神经网络以当前精度量化的层数, 量化越多, 占用显存越低, 但质量相应下降"
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "神经网络以当前精度量化的层数, 量化越多, 占用显存越低, 但质量相应下降",
+  "Parallel Token Chunk Size": "并行Token块大小",
+  "Maximum tokens to be processed in parallel at once. For high end GPUs, this could be 64 or 128 (faster).": "一次最多可以并行处理的token数量. 对于高端显卡, 这可以是64或128 (更快)"
 }
@@ -331,7 +331,21 @@ const Configs: FC = observer(() => {
                         }} />
                     } />
                 }
-                {selectedConfig.modelParameters.device.startsWith('WebGPU') && <div />}
+                {
+                  selectedConfig.modelParameters.device.startsWith('WebGPU') &&
+                  <Labeled label={t('Parallel Token Chunk Size')}
+                    desc={t('Maximum tokens to be processed in parallel at once. For high end GPUs, this could be 64 or 128 (faster).')}
+                    content={
+                      <ValuedSlider
+                        value={selectedConfig.modelParameters.tokenChunkSize || 32}
+                        min={16} max={256} step={16} input
+                        onChange={(e, data) => {
+                          setSelectedConfigModelParams({
+                            tokenChunkSize: data.value
+                          });
+                        }} />
+                    } />
+                }
                 {
                   selectedConfig.modelParameters.device.startsWith('WebGPU') &&
                   <Labeled label={t('Quantized Layers')}
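The slider constrains tokenChunkSize to multiples of 16 between 16 and 256, and the "|| 32" fallback keeps the displayed value consistent with the backend default when the parameter has never been set.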
@@ -17,6 +17,7 @@ export type ModelParameters = {
   storedLayers: number;
   maxStoredLayers: number;
   quantizedLayers?: number;
+  tokenChunkSize?: number;
   useCustomCuda?: boolean;
   customStrategy?: string;
   useCustomTokenizer?: boolean;
@@ -196,6 +196,8 @@ export const getStrategy = (modelConfig: ModelConfig | undefined = undefined) =>
       strategy += params.precision === 'nf4' ? 'fp16i4' : params.precision === 'int8' ? 'fp16i8' : 'fp16';
       if (params.quantizedLayers)
         strategy += ` layer${params.quantizedLayers}`;
+      if (params.tokenChunkSize)
+        strategy += ` chunk${params.tokenChunkSize}`;
       break;
     case 'CUDA':
     case 'CUDA-Beta':
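Putting the two ends together, here is a rough sketch of the WebGPU strategy string the frontend now emits, written in Python for consistency with the earlier example (the real implementation is the TypeScript above, and build_webgpu_strategy is a hypothetical name):

```python
def build_webgpu_strategy(precision: str = "fp16",
                          quantized_layers: int | None = None,
                          token_chunk_size: int | None = None) -> str:
    # Mirrors the WebGPU branch of getStrategy() above: a base precision,
    # then optional " layerN" and " chunkN" suffixes.
    strategy = {"nf4": "fp16i4", "int8": "fp16i8"}.get(precision, "fp16")
    if quantized_layers:
        strategy += f" layer{quantized_layers}"
    if token_chunk_size:
        strategy += f" chunk{token_chunk_size}"
    return strategy

# int8 precision, 24 quantized layers, and a chunk size of 64 produce
# exactly the kind of string the backend parser shown earlier consumes:
assert build_webgpu_strategy("int8", 24, 64) == "fp16i8 layer24 chunk64"
```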