allow setting quantizedLayers of WebGPU mode; chore
Parent: c9513822c9
Commit: 887ba06bd6

backend-python/rwkv_pip/webgpu/model.py (vendored, 13 lines changed)
@@ -19,12 +19,19 @@ class RWKV:
         self.version = str(self.info.version).lower()
         self.wrp = getattr(wrp, self.version)
 
+        layer = (
+            int(s.lstrip("layer"))
+            for s in strategy.split()
+            for s in s.split(",")
+            if s.startswith("layer")
+        )
+
         args = {
             "file": model_path,
             "turbo": True,
-            "quant": 31 if "i8" in strategy else 0,
-            "quant_nf4": 26 if "i4" in strategy else 0,
-            "token_chunk_size": 32,
+            "quant": next(layer, 31) if "i8" in strategy else 0,
+            "quant_nf4": next(layer, 26) if "i4" in strategy else 0,
+            "token_chunk_size": 128,
             "lora": None,
         }
         self.model = self.wrp.Model(**args)
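For reference, a minimal standalone sketch of what the new parsing does. `quant_layers` is a hypothetical helper name, not part of the actual model.py; the strategy strings follow the format produced by `getStrategy` further down:

```python
# Minimal sketch of the new strategy parsing; quant_layers is a
# hypothetical helper written only for illustration.
def quant_layers(strategy: str):
    layer = (
        int(s.lstrip("layer"))
        for s in strategy.split()
        for s in s.split(",")
        if s.startswith("layer")
    )
    # next(layer, default) keeps the old hard-coded values as fallbacks
    # when the strategy carries no "layerN" token.
    quant = next(layer, 31) if "i8" in strategy else 0
    quant_nf4 = next(layer, 26) if "i4" in strategy else 0
    return quant, quant_nf4

assert quant_layers("fp16") == (0, 0)             # no quantization
assert quant_layers("fp16i8") == (31, 0)          # old default as fallback
assert quant_layers("fp16i8 layer24") == (24, 0)  # user-chosen layer count
assert quant_layers("fp16i4 layer10") == (0, 10)
```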
Japanese locale JSON:

@@ -341,5 +341,7 @@
   "Load Conversation": "会話を読み込む",
   "The latest X messages will be sent to the server. If you are using the RWKV-Runner server, please use the default value because RWKV-Runner has built-in state cache management which only calculates increments. Sending all messages will have lower cost. If you are using ChatGPT, adjust this value according to your needs to reduce ChatGPT expenses.": "最新のX件のメッセージがサーバーに送信されます。RWKV-Runnerサーバーを使用している場合は、デフォルト値を使用してください。RWKV-Runnerには組み込みの状態キャッシュ管理があり、増分のみを計算します。すべてのメッセージを送信すると、コストが低くなります。ChatGPTを使用している場合は、ChatGPTの費用を削減するために必要に応じてこの値を調整してください。",
   "History Message Number": "履歴メッセージ数",
-  "Send All Message": "すべてのメッセージを送信"
+  "Send All Message": "すべてのメッセージを送信",
+  "Quantized Layers": "量子化されたレイヤー",
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "現在の精度で量子化されたニューラルネットワークのレイヤーの数、量子化するほどVRAMの使用量が低くなりますが、品質も相応に低下します。"
 }
Simplified Chinese locale JSON:

@@ -341,5 +341,7 @@
   "Load Conversation": "读取对话",
   "The latest X messages will be sent to the server. If you are using the RWKV-Runner server, please use the default value because RWKV-Runner has built-in state cache management which only calculates increments. Sending all messages will have lower cost. If you are using ChatGPT, adjust this value according to your needs to reduce ChatGPT expenses.": "最近的X条消息会发送至服务器. 如果你正在使用RWKV-Runner服务器, 请使用默认值, 因为RWKV-Runner内置了state缓存管理, 只计算增量, 发送所有消息将具有更低的成本. 如果你正在使用ChatGPT, 则根据你的需要调整此值, 这可以降低ChatGPT的费用",
   "History Message Number": "历史消息数量",
-  "Send All Message": "发送所有消息"
+  "Send All Message": "发送所有消息",
+  "Quantized Layers": "量化层数",
+  "Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "神经网络以当前精度量化的层数, 量化越多, 占用显存越低, 但质量相应下降"
 }
NumberInput component:

@@ -10,9 +10,10 @@ export const NumberInput: FC<{
   onChange?: (ev: React.ChangeEvent<HTMLInputElement>, data: SliderOnChangeData) => void
   style?: CSSProperties,
   toFixed?: number
-}> = ({ value, min, max, step, onChange, style, toFixed = 2 }) => {
+  disabled?: boolean
+}> = ({ value, min, max, step, onChange, style, toFixed = 2, disabled }) => {
   return (
-    <Input type="number" style={style} value={value.toString()} min={min} max={max} step={step}
+    <Input type="number" style={style} value={value.toString()} min={min} max={max} step={step} disabled={disabled}
       onChange={(e, data) => {
         onChange?.(e, { value: Number(data.value) });
       }}
ValuedSlider component:

@@ -11,7 +11,8 @@ export const ValuedSlider: FC<{
   input?: boolean
   onChange?: (ev: React.ChangeEvent<HTMLInputElement>, data: SliderOnChangeData) => void,
   toFixed?: number
-}> = ({ value, min, max, step, input, onChange, toFixed }) => {
+  disabled?: boolean
+}> = ({ value, min, max, step, input, onChange, toFixed, disabled }) => {
   const sliderRef = useRef<HTMLInputElement>(null);
   useEffect(() => {
     if (step && sliderRef.current && sliderRef.current.parentElement) {

@@ -24,10 +25,10 @@ export const ValuedSlider: FC<{
     <div className="flex items-center">
       <Slider ref={sliderRef} className="grow" style={{ minWidth: '50%' }} value={value} min={min}
         max={max} step={step}
-        onChange={onChange} />
+        onChange={onChange} disabled={disabled} />
       {input
         ? <NumberInput style={{ minWidth: 0 }} value={value} min={min} max={max} step={step} onChange={onChange}
-          toFixed={toFixed} />
+          toFixed={toFixed} disabled={disabled} />
         : <Text>{value}</Text>}
     </div>
   );
Configs page:

@@ -331,6 +331,26 @@ const Configs: FC = observer(() => {
                 }} />
             } />
           }
+          {selectedConfig.modelParameters.device.startsWith('WebGPU') && <div />}
+          {
+            selectedConfig.modelParameters.device.startsWith('WebGPU') &&
+            <Labeled label={t('Quantized Layers')}
+              desc={t('Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.')}
+              content={
+                <ValuedSlider
+                  disabled={selectedConfig.modelParameters.precision !== 'int8' && selectedConfig.modelParameters.precision !== 'nf4'}
+                  value={selectedConfig.modelParameters.precision === 'int8' ? (selectedConfig.modelParameters.quantizedLayers || 31) :
+                    selectedConfig.modelParameters.precision === 'nf4' ? (selectedConfig.modelParameters.quantizedLayers || 26) :
+                      selectedConfig.modelParameters.maxStoredLayers
+                  } min={0}
+                  max={selectedConfig.modelParameters.maxStoredLayers} step={1} input
+                  onChange={(e, data) => {
+                    setSelectedConfigModelParams({
+                      quantizedLayers: data.value
+                    });
+                  }} />
+              } />
+          }
           {selectedConfig.modelParameters.device.startsWith('CUDA') && <div />}
           {
             displayStrategyImg &&
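The slider's display fallbacks (31 for int8, 26 for nf4) mirror the backend defaults in model.py above, so an untouched slider shows the value the backend would pick anyway; the control is disabled entirely when the selected precision is not a quantized one.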
ModelParameters type:

@@ -16,6 +16,7 @@ export type ModelParameters = {
   precision: Precision;
   storedLayers: number;
   maxStoredLayers: number;
+  quantizedLayers?: number;
   useCustomCuda?: boolean;
   customStrategy?: string;
   useCustomTokenizer?: boolean;
getStrategy helper:

@@ -194,6 +194,8 @@ export const getStrategy = (modelConfig: ModelConfig | undefined = undefined) =>
     case 'WebGPU':
     case 'WebGPU (Python)':
       strategy += params.precision === 'nf4' ? 'fp16i4' : params.precision === 'int8' ? 'fp16i8' : 'fp16';
+      if (params.quantizedLayers)
+        strategy += ` layer${params.quantizedLayers}`;
       break;
     case 'CUDA':
     case 'CUDA-Beta':
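Taken together, the frontend now emits strategies like `fp16i8 layer24`, which the backend generator at the top parses back out. A sketch of the round trip; `webgpu_strategy` is a hypothetical Python mirror of the TypeScript branch above (which in reality appends to an accumulated strategy string), written only for illustration:

```python
# Hypothetical Python mirror of the WebGPU branch of getStrategy above,
# paired with the quant_layers sketch from the model.py section.
def webgpu_strategy(precision: str, quantized_layers: int = 0) -> str:
    strategy = {"nf4": "fp16i4", "int8": "fp16i8"}.get(precision, "fp16")
    if quantized_layers:  # falsy 0 means "let the backend default apply"
        strategy += f" layer{quantized_layers}"
    return strategy

assert webgpu_strategy("int8", 24) == "fp16i8 layer24"
assert webgpu_strategy("nf4") == "fp16i4"
# Round trip with the backend parser:
# quant_layers("fp16i8 layer24") == (24, 0)
# quant_layers("fp16i4")         == (0, 26)
```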