allow setting quantizedLayers of WebGPU mode; chore
commit 887ba06bd6 (parent c9513822c9)
backend-python/rwkv_pip/webgpu/model.py (vendored, 13 changes)
@@ -19,12 +19,19 @@ class RWKV:
         self.version = str(self.info.version).lower()
         self.wrp = getattr(wrp, self.version)
 
+        layer = (
+            int(s.lstrip("layer"))
+            for s in strategy.split()
+            for s in s.split(",")
+            if s.startswith("layer")
+        )
+
         args = {
             "file": model_path,
             "turbo": True,
-            "quant": 31 if "i8" in strategy else 0,
-            "quant_nf4": 26 if "i4" in strategy else 0,
-            "token_chunk_size": 32,
+            "quant": next(layer, 31) if "i8" in strategy else 0,
+            "quant_nf4": next(layer, 26) if "i4" in strategy else 0,
+            "token_chunk_size": 128,
             "lora": None,
         }
         self.model = self.wrp.Model(**args)
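
The generator above pulls the number out of any "layerN" token in the strategy string, and next(layer, ...) supplies the old hard-coded values (31 for i8, 26 for nf4) as fallbacks when no such token exists. A minimal self-contained sketch of the same parsing; parse_quant is a hypothetical helper name, not part of the commit:

def parse_quant(strategy: str) -> dict:
    # Yield the numeric part of any "layerN" token, e.g. "fp16i8 layer24" -> 24.
    # Note: lstrip("layer") strips the characters l/a/y/e/r, not the literal
    # prefix; it works here only because digits follow immediately.
    layer = (
        int(s.lstrip("layer"))
        for s in strategy.split()
        for s in s.split(",")
        if s.startswith("layer")
    )
    return {
        "quant": next(layer, 31) if "i8" in strategy else 0,
        "quant_nf4": next(layer, 26) if "i4" in strategy else 0,
    }

print(parse_quant("fp16i8 layer24"))  # {'quant': 24, 'quant_nf4': 0}
print(parse_quant("fp16i8"))          # {'quant': 31, 'quant_nf4': 0}
print(parse_quant("fp16i4"))          # {'quant': 0, 'quant_nf4': 26}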

Japanese locale file:

@@ -341,5 +341,7 @@
   "Load Conversation": "会話を読み込む",
   "The latest X messages will be sent to the server. If you are using the RWKV-Runner server, please use the default value because RWKV-Runner has built-in state cache management which only calculates increments. Sending all messages will have lower cost. If you are using ChatGPT, adjust this value according to your needs to reduce ChatGPT expenses.": "最新のX件のメッセージがサーバーに送信されます。RWKV-Runnerサーバーを使用している場合は、デフォルト値を使用してください。RWKV-Runnerには組み込みの状態キャッシュ管理があり、増分のみを計算します。すべてのメッセージを送信すると、コストが低くなります。ChatGPTを使用している場合は、ChatGPTの費用を削減するために必要に応じてこの値を調整してください。",
   "History Message Number": "履歴メッセージ数",
-  "Send All Message": "すべてのメッセージを送信"
+  "Send All Message": "すべてのメッセージを送信",
+  "Quantized Layers": "量子化されたレイヤー",
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "現在の精度で量子化されたニューラルネットワークのレイヤーの数、量子化するほどVRAMの使用量が低くなりますが、品質も相応に低下します。"
 }

Chinese locale file:

@@ -341,5 +341,7 @@
   "Load Conversation": "读取对话",
   "The latest X messages will be sent to the server. If you are using the RWKV-Runner server, please use the default value because RWKV-Runner has built-in state cache management which only calculates increments. Sending all messages will have lower cost. If you are using ChatGPT, adjust this value according to your needs to reduce ChatGPT expenses.": "最近的X条消息会发送至服务器. 如果你正在使用RWKV-Runner服务器, 请使用默认值, 因为RWKV-Runner内置了state缓存管理, 只计算增量, 发送所有消息将具有更低的成本. 如果你正在使用ChatGPT, 则根据你的需要调整此值, 这可以降低ChatGPT的费用",
   "History Message Number": "历史消息数量",
-  "Send All Message": "发送所有消息"
+  "Send All Message": "发送所有消息",
+  "Quantized Layers": "量化层数",
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "神经网络以当前精度量化的层数, 量化越多, 占用显存越低, 但质量相应下降"
 }

@@ -10,9 +10,10 @@ export const NumberInput: FC<{
   onChange?: (ev: React.ChangeEvent<HTMLInputElement>, data: SliderOnChangeData) => void
   style?: CSSProperties,
   toFixed?: number
-}> = ({ value, min, max, step, onChange, style, toFixed = 2 }) => {
+  disabled?: boolean
+}> = ({ value, min, max, step, onChange, style, toFixed = 2, disabled }) => {
   return (
-    <Input type="number" style={style} value={value.toString()} min={min} max={max} step={step}
+    <Input type="number" style={style} value={value.toString()} min={min} max={max} step={step} disabled={disabled}
       onChange={(e, data) => {
         onChange?.(e, { value: Number(data.value) });
       }}

@@ -11,7 +11,8 @@ export const ValuedSlider: FC<{
   input?: boolean
   onChange?: (ev: React.ChangeEvent<HTMLInputElement>, data: SliderOnChangeData) => void,
   toFixed?: number
-}> = ({ value, min, max, step, input, onChange, toFixed }) => {
+  disabled?: boolean
+}> = ({ value, min, max, step, input, onChange, toFixed, disabled }) => {
   const sliderRef = useRef<HTMLInputElement>(null);
   useEffect(() => {
     if (step && sliderRef.current && sliderRef.current.parentElement) {

@@ -24,10 +25,10 @@
     <div className="flex items-center">
       <Slider ref={sliderRef} className="grow" style={{ minWidth: '50%' }} value={value} min={min}
               max={max} step={step}
-              onChange={onChange} />
+              onChange={onChange} disabled={disabled} />
       {input
         ? <NumberInput style={{ minWidth: 0 }} value={value} min={min} max={max} step={step} onChange={onChange}
-                       toFixed={toFixed} />
+                       toFixed={toFixed} disabled={disabled} />
         : <Text>{value}</Text>}
     </div>
   );

@@ -331,6 +331,26 @@ const Configs: FC = observer(() => {
                 }} />
             } />
         }
+        {selectedConfig.modelParameters.device.startsWith('WebGPU') && <div />}
+        {
+          selectedConfig.modelParameters.device.startsWith('WebGPU') &&
+          <Labeled label={t('Quantized Layers')}
+            desc={t('Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.')}
+            content={
+              <ValuedSlider
+                disabled={selectedConfig.modelParameters.precision !== 'int8' && selectedConfig.modelParameters.precision !== 'nf4'}
+                value={selectedConfig.modelParameters.precision === 'int8' ? (selectedConfig.modelParameters.quantizedLayers || 31) :
+                  selectedConfig.modelParameters.precision === 'nf4' ? (selectedConfig.modelParameters.quantizedLayers || 26) :
+                    selectedConfig.modelParameters.maxStoredLayers
+                } min={0}
+                max={selectedConfig.modelParameters.maxStoredLayers} step={1} input
+                onChange={(e, data) => {
+                  setSelectedConfigModelParams({
+                    quantizedLayers: data.value
+                  });
+                }} />
+            } />
+        }
         {selectedConfig.modelParameters.device.startsWith('CUDA') && <div />}
         {
           displayStrategyImg &&
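
Note the fallback defaults in the slider's value expression: 31 for int8 and 26 for nf4 are the same fallbacks the backend applies via next(layer, 31) and next(layer, 26) in model.py, so an untouched slider displays the value the backend would use anyway. A sketch of that selection in Python; displayed_layers is an illustrative name, not code from the commit:

def displayed_layers(precision: str, quantized_layers: int | None, max_stored_layers: int) -> int:
    # Mirrors the value= expression above; the slider is disabled for other precisions.
    if precision == "int8":
        return quantized_layers or 31   # same fallback as next(layer, 31)
    if precision == "nf4":
        return quantized_layers or 26   # same fallback as next(layer, 26)
    return max_stored_layers

assert displayed_layers("int8", None, 41) == 31
assert displayed_layers("nf4", 12, 41) == 12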

@@ -16,6 +16,7 @@ export type ModelParameters = {
   precision: Precision;
   storedLayers: number;
   maxStoredLayers: number;
+  quantizedLayers?: number;
   useCustomCuda?: boolean;
   customStrategy?: string;
   useCustomTokenizer?: boolean;

@@ -194,6 +194,8 @@ export const getStrategy = (modelConfig: ModelConfig | undefined = undefined) =>
     case 'WebGPU':
     case 'WebGPU (Python)':
       strategy += params.precision === 'nf4' ? 'fp16i4' : params.precision === 'int8' ? 'fp16i8' : 'fp16';
+      if (params.quantizedLayers)
+        strategy += ` layer${params.quantizedLayers}`;
       break;
     case 'CUDA':
     case 'CUDA-Beta':
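
Round trip: with these two lines, a WebGPU config with int8 precision and quantizedLayers set to 24 produces the strategy string "fp16i8 layer24", which the new generator in model.py parses back into quant = 24. A hypothetical Python mirror of this TypeScript branch, for illustration only:

def webgpu_strategy(precision: str, quantized_layers: int | None) -> str:
    # Mirrors the TypeScript above: precision picks the base strategy,
    # and a truthy quantizedLayers appends a "layerN" token.
    strategy = {"nf4": "fp16i4", "int8": "fp16i8"}.get(precision, "fp16")
    if quantized_layers:
        strategy += f" layer{quantized_layers}"
    return strategy

assert webgpu_strategy("int8", 24) == "fp16i8 layer24"  # parsed back as quant=24
assert webgpu_strategy("nf4", None) == "fp16i4"         # backend falls back to 26
assert webgpu_strategy("fp16", None) == "fp16"          # no quantization requested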