allow setting quantizedLayers of WebGPU mode; chore

This commit is contained in:
josc146 2024-03-01 14:23:05 +08:00
parent c9513822c9
commit 887ba06bd6
8 changed files with 46 additions and 10 deletions

View File

@@ -19,12 +19,19 @@ class RWKV:
self.version = str(self.info.version).lower()
self.wrp = getattr(wrp, self.version)
layer = (
int(s.lstrip("layer"))
for s in strategy.split()
for s in s.split(",")
if s.startswith("layer")
)
args = {
"file": model_path,
"turbo": True,
"quant": 31 if "i8" in strategy else 0,
"quant_nf4": 26 if "i4" in strategy else 0,
"token_chunk_size": 32,
"quant": next(layer, 31) if "i8" in strategy else 0,
"quant_nf4": next(layer, 26) if "i4" in strategy else 0,
"token_chunk_size": 128,
"lora": None,
}
self.model = self.wrp.Model(**args)

View File

@@ -341,5 +341,7 @@
"Load Conversation": "会話を読み込む",
"The latest X messages will be sent to the server. If you are using the RWKV-Runner server, please use the default value because RWKV-Runner has built-in state cache management which only calculates increments. Sending all messages will have lower cost. If you are using ChatGPT, adjust this value according to your needs to reduce ChatGPT expenses.": "最新のX件のメッセージがサーバーに送信されます。RWKV-Runnerサーバーを使用している場合は、デフォルト値を使用してください。RWKV-Runnerには組み込みの状態キャッシュ管理があり、増分のみを計算します。すべてのメッセージを送信すると、コストが低くなります。ChatGPTを使用している場合は、ChatGPTの費用を削減するために必要に応じてこの値を調整してください。",
"History Message Number": "履歴メッセージ数",
"Send All Message": "すべてのメッセージを送信"
"Send All Message": "すべてのメッセージを送信",
"Quantized Layers": "量子化されたレイヤー",
"Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "現在の精度で量子化されたニューラルネットワークのレイヤーの数、量子化するほどVRAMの使用量が低くなりますが、品質も相応に低下します。"
}

View File

@@ -341,5 +341,7 @@
"Load Conversation": "读取对话",
"The latest X messages will be sent to the server. If you are using the RWKV-Runner server, please use the default value because RWKV-Runner has built-in state cache management which only calculates increments. Sending all messages will have lower cost. If you are using ChatGPT, adjust this value according to your needs to reduce ChatGPT expenses.": "最近的X条消息会发送至服务器. 如果你正在使用RWKV-Runner服务器, 请使用默认值, 因为RWKV-Runner内置了state缓存管理, 只计算增量, 发送所有消息将具有更低的成本. 如果你正在使用ChatGPT, 则根据你的需要调整此值, 这可以降低ChatGPT的费用",
"History Message Number": "历史消息数量",
"Send All Message": "发送所有消息"
"Send All Message": "发送所有消息",
"Quantized Layers": "量化层数",
"Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.": "神经网络以当前精度量化的层数, 量化越多, 占用显存越低, 但质量相应下降"
}

View File

@@ -10,9 +10,10 @@ export const NumberInput: FC<{
onChange?: (ev: React.ChangeEvent<HTMLInputElement>, data: SliderOnChangeData) => void
style?: CSSProperties,
toFixed?: number
}> = ({ value, min, max, step, onChange, style, toFixed = 2 }) => {
disabled?: boolean
}> = ({ value, min, max, step, onChange, style, toFixed = 2, disabled }) => {
return (
<Input type="number" style={style} value={value.toString()} min={min} max={max} step={step}
<Input type="number" style={style} value={value.toString()} min={min} max={max} step={step} disabled={disabled}
onChange={(e, data) => {
onChange?.(e, { value: Number(data.value) });
}}

View File

@@ -11,7 +11,8 @@ export const ValuedSlider: FC<{
input?: boolean
onChange?: (ev: React.ChangeEvent<HTMLInputElement>, data: SliderOnChangeData) => void,
toFixed?: number
}> = ({ value, min, max, step, input, onChange, toFixed }) => {
disabled?: boolean
}> = ({ value, min, max, step, input, onChange, toFixed, disabled }) => {
const sliderRef = useRef<HTMLInputElement>(null);
useEffect(() => {
if (step && sliderRef.current && sliderRef.current.parentElement) {
@@ -24,10 +25,10 @@ export const ValuedSlider: FC<{
<div className="flex items-center">
<Slider ref={sliderRef} className="grow" style={{ minWidth: '50%' }} value={value} min={min}
max={max} step={step}
onChange={onChange} />
onChange={onChange} disabled={disabled} />
{input
? <NumberInput style={{ minWidth: 0 }} value={value} min={min} max={max} step={step} onChange={onChange}
toFixed={toFixed} />
toFixed={toFixed} disabled={disabled} />
: <Text>{value}</Text>}
</div>
);

View File

@@ -331,6 +331,26 @@ const Configs: FC = observer(() => {
}} />
} />
}
{selectedConfig.modelParameters.device.startsWith('WebGPU') && <div />}
{
selectedConfig.modelParameters.device.startsWith('WebGPU') &&
<Labeled label={t('Quantized Layers')}
desc={t('Number of the neural network layers quantized with current precision, the more you quantize, the lower the VRAM usage, but the quality correspondingly decreases.')}
content={
<ValuedSlider
disabled={selectedConfig.modelParameters.precision !== 'int8' && selectedConfig.modelParameters.precision !== 'nf4'}
value={selectedConfig.modelParameters.precision === 'int8' ? (selectedConfig.modelParameters.quantizedLayers || 31) :
selectedConfig.modelParameters.precision === 'nf4' ? (selectedConfig.modelParameters.quantizedLayers || 26) :
selectedConfig.modelParameters.maxStoredLayers
} min={0}
max={selectedConfig.modelParameters.maxStoredLayers} step={1} input
onChange={(e, data) => {
setSelectedConfigModelParams({
quantizedLayers: data.value
});
}} />
} />
}
{selectedConfig.modelParameters.device.startsWith('CUDA') && <div />}
{
displayStrategyImg &&

View File

@@ -16,6 +16,7 @@ export type ModelParameters = {
precision: Precision;
storedLayers: number;
maxStoredLayers: number;
quantizedLayers?: number;
useCustomCuda?: boolean;
customStrategy?: string;
useCustomTokenizer?: boolean;

View File

@@ -194,6 +194,8 @@ export const getStrategy = (modelConfig: ModelConfig | undefined = undefined) =>
case 'WebGPU':
case 'WebGPU (Python)':
strategy += params.precision === 'nf4' ? 'fp16i4' : params.precision === 'int8' ? 'fp16i8' : 'fp16';
if (params.quantizedLayers)
strategy += ` layer${params.quantizedLayers}`;
break;
case 'CUDA':
case 'CUDA-Beta':