Support using a directory as training data
This commit is contained in:
parent
6fbb86667c
commit
bcb125e168
@ -43,6 +43,52 @@ func (a *App) ConvertData(python string, input string, outputPrefix string, voca
|
|||||||
if strings.Contains(vocab, "rwkv_vocab_v20230424") {
|
if strings.Contains(vocab, "rwkv_vocab_v20230424") {
|
||||||
tokenizerType = "RWKVTokenizer"
|
tokenizerType = "RWKVTokenizer"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
input = strings.TrimSuffix(input, "/")
|
||||||
|
if fi, err := os.Stat(input); err == nil && fi.IsDir() {
|
||||||
|
files, err := os.ReadDir(input)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
jsonlFile, err := os.Create(outputPrefix + ".jsonl")
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
defer jsonlFile.Close()
|
||||||
|
for _, file := range files {
|
||||||
|
if file.IsDir() || !strings.HasSuffix(file.Name(), ".txt") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
txtFile, err := os.Open(input + "/" + file.Name())
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
defer txtFile.Close()
|
||||||
|
jsonlFile.WriteString("{\"text\": \"")
|
||||||
|
buf := make([]byte, 1024)
|
||||||
|
for {
|
||||||
|
n, err := txtFile.Read(buf)
|
||||||
|
if err != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// replace every line break (\r\n, \n, \r) with a literal \\n so the file's text stays on a single jsonl line
|
||||||
|
jsonlFile.WriteString(
|
||||||
|
strings.ReplaceAll(
|
||||||
|
strings.ReplaceAll(
|
||||||
|
strings.ReplaceAll(
|
||||||
|
strings.ReplaceAll(string(buf[:n]),
|
||||||
|
"\r\n", "\\n"),
|
||||||
|
"\n", "\\n"),
|
||||||
|
"\r", "\\n"),
|
||||||
|
"\n\n", "\\n"))
|
||||||
|
}
|
||||||
|
jsonlFile.WriteString("\"}\n")
|
||||||
|
}
|
||||||
|
input = outputPrefix + ".jsonl"
|
||||||
|
} else if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
return Cmd(python, "./finetune/json2binidx_tool/tools/preprocess_data.py", "--input", input, "--output-prefix", outputPrefix, "--vocab", vocab,
|
return Cmd(python, "./finetune/json2binidx_tool/tools/preprocess_data.py", "--input", input, "--output-prefix", outputPrefix, "--vocab", vocab,
|
||||||
"--tokenizer-type", tokenizerType, "--dataset-impl", "mmap", "--append-eod")
|
"--tokenizer-type", tokenizerType, "--dataset-impl", "mmap", "--append-eod")
|
||||||
}
|
}
|
||||||
|
@ -231,5 +231,6 @@
|
|||||||
"You are using WSL 1 for training, please upgrade to WSL 2. e.g. Run \"wsl --set-version Ubuntu-22.04 2\"": "你正在使用WSL 1进行训练,请升级到WSL 2。例如,运行\"wsl --set-version Ubuntu-22.04 2\"",
|
"You are using WSL 1 for training, please upgrade to WSL 2. e.g. Run \"wsl --set-version Ubuntu-22.04 2\"": "你正在使用WSL 1进行训练,请升级到WSL 2。例如,运行\"wsl --set-version Ubuntu-22.04 2\"",
|
||||||
"Matched CUDA is not installed": "未安装匹配的CUDA",
|
"Matched CUDA is not installed": "未安装匹配的CUDA",
|
||||||
"Failed to convert data": "数据转换失败",
|
"Failed to convert data": "数据转换失败",
|
||||||
"Failed to merge model": "合并模型失败"
|
"Failed to merge model": "合并模型失败",
|
||||||
|
"The data path should be a directory or a file in jsonl format (more formats will be supported in the future).\n\nWhen you provide a directory path, all the txt files within that directory will be automatically converted into training data. This is commonly used for large-scale training in writing, code generation, or knowledge bases.\n\nThe jsonl format file can be referenced at https://github.com/Abel2076/json2binidx_tool/blob/main/sample.jsonl.\nYou can also write it similar to OpenAI's playground format, as shown in https://platform.openai.com/playground/p/default-chat.\nEven for multi-turn conversations, they must be written in a single line using `\\n` to indicate line breaks. If they are different dialogues or topics, they should be written in separate lines.": "数据路径必须是一个文件夹,或者jsonl格式文件 (未来会支持更多格式)\n\n当你填写的路径是一个文件夹时,该文件夹内的所有txt文件会被自动转换为训练数据,通常这用于大批量训练写作,代码生成或知识库\n\njsonl文件的格式参考 https://github.com/Abel2076/json2binidx_tool/blob/main/sample.jsonl\n你也可以仿照openai的playground编写,参考 https://platform.openai.com/playground/p/default-chat\n即使是多轮对话也必须写在一行,用`\\n`表示换行,如果是不同对话或主题,则另起一行"
|
||||||
}
|
}
|
@ -11,6 +11,7 @@ import {
|
|||||||
} from '@fluentui/react-components';
|
} from '@fluentui/react-components';
|
||||||
import { ToolTipButton } from './ToolTipButton';
|
import { ToolTipButton } from './ToolTipButton';
|
||||||
import { useTranslation } from 'react-i18next';
|
import { useTranslation } from 'react-i18next';
|
||||||
|
import MarkdownRender from './MarkdownRender';
|
||||||
|
|
||||||
export const DialogButton: FC<{
|
export const DialogButton: FC<{
|
||||||
text?: string | null
|
text?: string | null
|
||||||
@ -19,12 +20,13 @@ export const DialogButton: FC<{
|
|||||||
className?: string,
|
className?: string,
|
||||||
title: string,
|
title: string,
|
||||||
contentText: string,
|
contentText: string,
|
||||||
onConfirm: () => void,
|
markdown?: boolean,
|
||||||
|
onConfirm?: () => void,
|
||||||
size?: 'small' | 'medium' | 'large',
|
size?: 'small' | 'medium' | 'large',
|
||||||
shape?: 'rounded' | 'circular' | 'square',
|
shape?: 'rounded' | 'circular' | 'square',
|
||||||
appearance?: 'secondary' | 'primary' | 'outline' | 'subtle' | 'transparent',
|
appearance?: 'secondary' | 'primary' | 'outline' | 'subtle' | 'transparent',
|
||||||
}> = ({
|
}> = ({
|
||||||
text, icon, tooltip, className, title, contentText,
|
text, icon, tooltip, className, title, contentText, markdown,
|
||||||
onConfirm, size, shape, appearance
|
onConfirm, size, shape, appearance
|
||||||
}) => {
|
}) => {
|
||||||
const { t } = useTranslation();
|
const { t } = useTranslation();
|
||||||
@ -41,7 +43,11 @@ export const DialogButton: FC<{
|
|||||||
<DialogBody>
|
<DialogBody>
|
||||||
<DialogTitle>{title}</DialogTitle>
|
<DialogTitle>{title}</DialogTitle>
|
||||||
<DialogContent>
|
<DialogContent>
|
||||||
{contentText}
|
{
|
||||||
|
markdown ?
|
||||||
|
<MarkdownRender>{contentText}</MarkdownRender> :
|
||||||
|
contentText
|
||||||
|
}
|
||||||
</DialogContent>
|
</DialogContent>
|
||||||
<DialogActions>
|
<DialogActions>
|
||||||
<DialogTrigger disableButtonEnhancement>
|
<DialogTrigger disableButtonEnhancement>
|
||||||
|
@ -39,6 +39,7 @@ import { Line } from 'react-chartjs-2';
|
|||||||
import { ChartJSOrUndefined } from 'react-chartjs-2/dist/types';
|
import { ChartJSOrUndefined } from 'react-chartjs-2/dist/types';
|
||||||
import { WindowShow } from '../../wailsjs/runtime';
|
import { WindowShow } from '../../wailsjs/runtime';
|
||||||
import { t } from 'i18next';
|
import { t } from 'i18next';
|
||||||
|
import { DialogButton } from '../components/DialogButton';
|
||||||
|
|
||||||
ChartJS.register(
|
ChartJS.register(
|
||||||
CategoryScale,
|
CategoryScale,
|
||||||
@ -400,29 +401,36 @@ const LoraFinetune: FC = observer(() => {
|
|||||||
title={t('Data Process')}
|
title={t('Data Process')}
|
||||||
content={
|
content={
|
||||||
<div className="flex flex-col gap-2">
|
<div className="flex flex-col gap-2">
|
||||||
<Labeled flex label={t('Data Path')}
|
<div className="flex gap-2 items-center">
|
||||||
content={
|
{t('Data Path')}
|
||||||
<div className="grow flex gap-2">
|
<Input className="grow" style={{ minWidth: 0 }} value={dataParams.dataPath}
|
||||||
<Input className="grow ml-2" value={dataParams.dataPath}
|
|
||||||
onChange={(e, data) => {
|
onChange={(e, data) => {
|
||||||
setDataParams({ dataPath: data.value });
|
setDataParams({ dataPath: data.value });
|
||||||
}} />
|
}} />
|
||||||
|
<DialogButton text={t('Help')} title={t('Help')} markdown
|
||||||
|
contentText={t('The data path should be a directory or a file in jsonl format (more formats will be supported in the future).\n\n' +
|
||||||
|
'When you provide a directory path, all the txt files within that directory will be automatically converted into training data. ' +
|
||||||
|
'This is commonly used for large-scale training in writing, code generation, or knowledge bases.\n\n' +
|
||||||
|
'The jsonl format file can be referenced at https://github.com/Abel2076/json2binidx_tool/blob/main/sample.jsonl.\n' +
|
||||||
|
'You can also write it similar to OpenAI\'s playground format, as shown in https://platform.openai.com/playground/p/default-chat.\n' +
|
||||||
|
'Even for multi-turn conversations, they must be written in a single line using `\\n` to indicate line breaks. ' +
|
||||||
|
'If they are different dialogues or topics, they should be written in separate lines.')} />
|
||||||
<ToolTipButton desc={t('Open Folder')} icon={<Folder20Regular />} onClick={() => {
|
<ToolTipButton desc={t('Open Folder')} icon={<Folder20Regular />} onClick={() => {
|
||||||
OpenFileFolder(dataParams.dataPath, false);
|
OpenFileFolder(dataParams.dataPath, false);
|
||||||
}} />
|
}} />
|
||||||
</div>
|
</div>
|
||||||
} />
|
|
||||||
<div className="flex gap-2 items-center">
|
<div className="flex gap-2 items-center">
|
||||||
{t('Vocab Path')}
|
{t('Vocab Path')}
|
||||||
<Input className="grow" style={{ minWidth: 0 }} value={dataParams.vocabPath}
|
<Input className="grow" style={{ minWidth: 0 }} value={dataParams.vocabPath}
|
||||||
onChange={(e, data) => {
|
onChange={(e, data) => {
|
||||||
setDataParams({ vocabPath: data.value });
|
setDataParams({ vocabPath: data.value });
|
||||||
}} />
|
}} />
|
||||||
<Button appearance="secondary" size="large" onClick={async () => {
|
<Button appearance="secondary" onClick={async () => {
|
||||||
const ok = await checkDependencies(navigate);
|
const ok = await checkDependencies(navigate);
|
||||||
if (!ok)
|
if (!ok)
|
||||||
return;
|
return;
|
||||||
const outputPrefix = './finetune/json2binidx_tool/data/' + dataParams.dataPath.split(/[\/\\]/).pop()!.split('.')[0];
|
const outputPrefix = './finetune/json2binidx_tool/data/' +
|
||||||
|
dataParams.dataPath.replace(/[\/\\]$/, '').split(/[\/\\]/).pop()!.split('.')[0];
|
||||||
ConvertData(commonStore.settings.customPythonPath, dataParams.dataPath, outputPrefix, dataParams.vocabPath).then(async () => {
|
ConvertData(commonStore.settings.customPythonPath, dataParams.dataPath, outputPrefix, dataParams.vocabPath).then(async () => {
|
||||||
if (!await FileExists(outputPrefix + '_text_document.idx')) {
|
if (!await FileExists(outputPrefix + '_text_document.idx')) {
|
||||||
toast(t('Failed to convert data') + ' - ' + await GetPyError(), { type: 'error' });
|
toast(t('Failed to convert data') + ' - ' + await GetPyError(), { type: 'error' });
|
||||||
|
Loading…
Reference in New Issue
Block a user