From bcb125e1683b48dff381ef5c7c52936730761c0b Mon Sep 17 00:00:00 2001 From: josc146 Date: Fri, 7 Jul 2023 21:57:01 +0800 Subject: [PATCH] support using directory as training data --- backend-golang/rwkv.go | 46 ++++++++++++++++++++++++ frontend/src/_locales/zh-hans/main.json | 3 +- frontend/src/components/DialogButton.tsx | 12 +++++-- frontend/src/pages/Train.tsx | 36 +++++++++++-------- 4 files changed, 79 insertions(+), 18 deletions(-) diff --git a/backend-golang/rwkv.go b/backend-golang/rwkv.go index e47dde1..fa980b7 100644 --- a/backend-golang/rwkv.go +++ b/backend-golang/rwkv.go @@ -43,6 +43,52 @@ func (a *App) ConvertData(python string, input string, outputPrefix string, voca if strings.Contains(vocab, "rwkv_vocab_v20230424") { tokenizerType = "RWKVTokenizer" } + + input = strings.TrimSuffix(input, "/") + if fi, err := os.Stat(input); err == nil && fi.IsDir() { + files, err := os.ReadDir(input) + if err != nil { + return "", err + } + jsonlFile, err := os.Create(outputPrefix + ".jsonl") + if err != nil { + return "", err + } + defer jsonlFile.Close() + for _, file := range files { + if file.IsDir() || !strings.HasSuffix(file.Name(), ".txt") { + continue + } + txtFile, err := os.Open(input + "/" + file.Name()) + if err != nil { + return "", err + } + defer txtFile.Close() + jsonlFile.WriteString("{\"text\": \"") + buf := make([]byte, 1024) + for { + n, err := txtFile.Read(buf) + if err != nil { + break + } + // regex replace \r\n \n \r with \\n + jsonlFile.WriteString( + strings.ReplaceAll( + strings.ReplaceAll( + strings.ReplaceAll( + strings.ReplaceAll(string(buf[:n]), + "\r\n", "\\n"), + "\n", "\\n"), + "\r", "\\n"), + "\n\n", "\\n")) + } + jsonlFile.WriteString("\"}\n") + } + input = outputPrefix + ".jsonl" + } else if err != nil { + return "", err + } + return Cmd(python, "./finetune/json2binidx_tool/tools/preprocess_data.py", "--input", input, "--output-prefix", outputPrefix, "--vocab", vocab, "--tokenizer-type", tokenizerType, "--dataset-impl", "mmap", "--append-eod") } diff --git a/frontend/src/_locales/zh-hans/main.json b/frontend/src/_locales/zh-hans/main.json index f715244..ba328fa 100644 --- a/frontend/src/_locales/zh-hans/main.json +++ b/frontend/src/_locales/zh-hans/main.json @@ -231,5 +231,6 @@ "You are using WSL 1 for training, please upgrade to WSL 2. e.g. Run \"wsl --set-version Ubuntu-22.04 2\"": "你正在使用WSL 1进行训练,请升级到WSL 2。例如,运行\"wsl --set-version Ubuntu-22.04 2\"", "Matched CUDA is not installed": "未安装匹配的CUDA", "Failed to convert data": "数据转换失败", - "Failed to merge model": "合并模型失败" + "Failed to merge model": "合并模型失败", + "The data path should be a directory or a file in jsonl format (more formats will be supported in the future).\n\nWhen you provide a directory path, all the txt files within that directory will be automatically converted into training data. This is commonly used for large-scale training in writing, code generation, or knowledge bases.\n\nThe jsonl format file can be referenced at https://github.com/Abel2076/json2binidx_tool/blob/main/sample.jsonl.\nYou can also write it similar to OpenAI's playground format, as shown in https://platform.openai.com/playground/p/default-chat.\nEven for multi-turn conversations, they must be written in a single line using `\\n` to indicate line breaks. If they are different dialogues or topics, they should be written in separate lines.": "数据路径必须是一个文件夹,或者jsonl格式文件 (未来会支持更多格式)\n\n当你填写的路径是一个文件夹时,该文件夹内的所有txt文件会被自动转换为训练数据,通常这用于大批量训练写作,代码生成或知识库\n\njsonl文件的格式参考 https://github.com/Abel2076/json2binidx_tool/blob/main/sample.jsonl\n你也可以仿照openai的playground编写,参考 https://platform.openai.com/playground/p/default-chat\n即使是多轮对话也必须写在一行,用`\\n`表示换行,如果是不同对话或主题,则另起一行" } \ No newline at end of file diff --git a/frontend/src/components/DialogButton.tsx b/frontend/src/components/DialogButton.tsx index 338fb92..0886fae 100644 --- a/frontend/src/components/DialogButton.tsx +++ b/frontend/src/components/DialogButton.tsx @@ -11,6 +11,7 @@ import { } from '@fluentui/react-components'; import { ToolTipButton } from './ToolTipButton'; import { useTranslation } from 'react-i18next'; +import MarkdownRender from './MarkdownRender'; export const DialogButton: FC<{ text?: string | null @@ -19,12 +20,13 @@ export const DialogButton: FC<{ className?: string, title: string, contentText: string, - onConfirm: () => void, + markdown?: boolean, + onConfirm?: () => void, size?: 'small' | 'medium' | 'large', shape?: 'rounded' | 'circular' | 'square', appearance?: 'secondary' | 'primary' | 'outline' | 'subtle' | 'transparent', }> = ({ - text, icon, tooltip, className, title, contentText, + text, icon, tooltip, className, title, contentText, markdown, onConfirm, size, shape, appearance }) => { const { t } = useTranslation(); @@ -41,7 +43,11 @@ export const DialogButton: FC<{ {title} - {contentText} + { + markdown ? + {contentText} : + contentText + } diff --git a/frontend/src/pages/Train.tsx b/frontend/src/pages/Train.tsx index ce216a0..cad1101 100644 --- a/frontend/src/pages/Train.tsx +++ b/frontend/src/pages/Train.tsx @@ -39,6 +39,7 @@ import { Line } from 'react-chartjs-2'; import { ChartJSOrUndefined } from 'react-chartjs-2/dist/types'; import { WindowShow } from '../../wailsjs/runtime'; import { t } from 'i18next'; +import { DialogButton } from '../components/DialogButton'; ChartJS.register( CategoryScale, @@ -400,29 +401,36 @@ const LoraFinetune: FC = observer(() => { title={t('Data Process')} content={
- - { - setDataParams({ dataPath: data.value }); - }} /> - } onClick={() => { - OpenFileFolder(dataParams.dataPath, false); - }} /> -
- } /> +
+ {t('Data Path')} + { + setDataParams({ dataPath: data.value }); + }} /> + + } onClick={() => { + OpenFileFolder(dataParams.dataPath, false); + }} /> +
{t('Vocab Path')} { setDataParams({ vocabPath: data.value }); }} /> -