support using directory as training data

2023-07-07 21:57:01 +08:00
parent 6fbb86667c
commit bcb125e168
4 changed files with 79 additions and 18 deletions
--- a/backend-golang/rwkv.go
+++ b/backend-golang/rwkv.go
@@ -43,6 +43,52 @@ func (a *App) ConvertData(python string, input string, outputPrefix string, voca
 	if strings.Contains(vocab, "rwkv_vocab_v20230424") {
 		tokenizerType = "RWKVTokenizer"
 	}
+
+	input = strings.TrimSuffix(input, "/")
+	if fi, err := os.Stat(input); err == nil && fi.IsDir() {
+		files, err := os.ReadDir(input)
+		if err != nil {
+			return "", err
+		}
+		jsonlFile, err := os.Create(outputPrefix + ".jsonl")
+		if err != nil {
+			return "", err
+		}
+		defer jsonlFile.Close()
+		for _, file := range files {
+			if file.IsDir() || !strings.HasSuffix(file.Name(), ".txt") {
+				continue
+			}
+			txtFile, err := os.Open(input + "/" + file.Name())
+			if err != nil {
+				return "", err
+			}
+			defer txtFile.Close()
+			jsonlFile.WriteString("{\"text\": \"")
+			buf := make([]byte, 1024)
+			for {
+				n, err := txtFile.Read(buf)
+				if err != nil {
+					break
+				}
+				// replace each line break (\r\n, \n or \r) with the literal "\\n" via plain string replacement (no regex)
+				jsonlFile.WriteString(
+					strings.ReplaceAll(
+						strings.ReplaceAll(
+							strings.ReplaceAll(
+								strings.ReplaceAll(string(buf[:n]),
+									"\r\n", "\\n"),
+								"\n", "\\n"),
+							"\r", "\\n"),
+						"\n\n", "\\n"))
+			}
+			jsonlFile.WriteString("\"}\n")
+		}
+		input = outputPrefix + ".jsonl"
+	} else if err != nil {
+		return "", err
+	}
+
 	return Cmd(python, "./finetune/json2binidx_tool/tools/preprocess_data.py", "--input", input, "--output-prefix", outputPrefix, "--vocab", vocab,
 		"--tokenizer-type", tokenizerType, "--dataset-impl", "mmap", "--append-eod")
 }
--- a/frontend/src/_locales/zh-hans/main.json
+++ b/frontend/src/_locales/zh-hans/main.json
@@ -231,5 +231,6 @@
  "You are using WSL 1 for training, please upgrade to WSL 2. e.g. Run \"wsl --set-version Ubuntu-22.04 2\"": "你正在使用WSL 1进行训练，请升级到WSL 2。例如，运行\"wsl --set-version Ubuntu-22.04 2\"",
  "Matched CUDA is not installed": "未安装匹配的CUDA",
  "Failed to convert data": "数据转换失败",
-  "Failed to merge model": "合并模型失败"
+  "Failed to merge model": "合并模型失败",
+  "The data path should be a directory or a file in jsonl format (more formats will be supported in the future).\n\nWhen you provide a directory path, all the txt files within that directory will be automatically converted into training data. This is commonly used for large-scale training in writing, code generation, or knowledge bases.\n\nThe jsonl format file can be referenced at https://github.com/Abel2076/json2binidx_tool/blob/main/sample.jsonl.\nYou can also write it similar to OpenAI's playground format, as shown in https://platform.openai.com/playground/p/default-chat.\nEven for multi-turn conversations, they must be written in a single line using `\\n` to indicate line breaks. If they are different dialogues or topics, they should be written in separate lines.": "数据路径必须是一个文件夹，或者jsonl格式文件 (未来会支持更多格式)\n\n当你填写的路径是一个文件夹时，该文件夹内的所有txt文件会被自动转换为训练数据，通常这用于大批量训练写作，代码生成或知识库\n\njsonl文件的格式参考 https://github.com/Abel2076/json2binidx_tool/blob/main/sample.jsonl\n你也可以仿照openai的playground编写，参考 https://platform.openai.com/playground/p/default-chat\n即使是多轮对话也必须写在一行，用`\\n`表示换行，如果是不同对话或主题，则另起一行"
 }
--- a/frontend/src/components/DialogButton.tsx
+++ b/frontend/src/components/DialogButton.tsx
@@ -11,6 +11,7 @@ import {
 } from '@fluentui/react-components';
 import { ToolTipButton } from './ToolTipButton';
 import { useTranslation } from 'react-i18next';
+import MarkdownRender from './MarkdownRender';

 export const DialogButton: FC<{
  text?: string | null
@@ -19,12 +20,13 @@ export const DialogButton: FC<{
  className?: string,
  title: string,
  contentText: string,
-  onConfirm: () => void,
+  markdown?: boolean,
+  onConfirm?: () => void,
  size?: 'small' | 'medium' | 'large',
  shape?: 'rounded' | 'circular' | 'square',
  appearance?: 'secondary' | 'primary' | 'outline' | 'subtle' | 'transparent',
 }> = ({
-  text, icon, tooltip, className, title, contentText,
+  text, icon, tooltip, className, title, contentText, markdown,
  onConfirm, size, shape, appearance
 }) => {
  const { t } = useTranslation();
@@ -41,7 +43,11 @@ export const DialogButton: FC<{
      <DialogBody>
        <DialogTitle>{title}</DialogTitle>
        <DialogContent>
-          {contentText}
+          {
+            markdown ?
+              <MarkdownRender>{contentText}</MarkdownRender> :
+              contentText
+          }
        </DialogContent>
        <DialogActions>
          <DialogTrigger disableButtonEnhancement>
--- a/frontend/src/pages/Train.tsx
+++ b/frontend/src/pages/Train.tsx
@@ -39,6 +39,7 @@ import { Line } from 'react-chartjs-2';
 import { ChartJSOrUndefined } from 'react-chartjs-2/dist/types';
 import { WindowShow } from '../../wailsjs/runtime';
 import { t } from 'i18next';
+import { DialogButton } from '../components/DialogButton';

 ChartJS.register(
  CategoryScale,
@@ -400,29 +401,36 @@ const LoraFinetune: FC = observer(() => {
          title={t('Data Process')}
          content={
            <div className="flex flex-col gap-2">
-              <Labeled flex label={t('Data Path')}
-                content={
-                  <div className="grow flex gap-2">
-                    <Input className="grow ml-2" value={dataParams.dataPath}
+              <div className="flex gap-2 items-center">
+                {t('Data Path')}
+                <Input className="grow" style={{ minWidth: 0 }} value={dataParams.dataPath}
                  onChange={(e, data) => {
                    setDataParams({ dataPath: data.value });
                  }} />
+                <DialogButton text={t('Help')} title={t('Help')} markdown
+                  contentText={t('The data path should be a directory or a file in jsonl format (more formats will be supported in the future).\n\n' +
+                    'When you provide a directory path, all the txt files within that directory will be automatically converted into training data. ' +
+                    'This is commonly used for large-scale training in writing, code generation, or knowledge bases.\n\n' +
+                    'The jsonl format file can be referenced at https://github.com/Abel2076/json2binidx_tool/blob/main/sample.jsonl.\n' +
+                    'You can also write it similar to OpenAI\'s playground format, as shown in https://platform.openai.com/playground/p/default-chat.\n' +
+                    'Even for multi-turn conversations, they must be written in a single line using `\\n` to indicate line breaks. ' +
+                    'If they are different dialogues or topics, they should be written in separate lines.')} />
                <ToolTipButton desc={t('Open Folder')} icon={<Folder20Regular />} onClick={() => {
                  OpenFileFolder(dataParams.dataPath, false);
                }} />
              </div>
-                } />
              <div className="flex gap-2 items-center">
                {t('Vocab Path')}
                <Input className="grow" style={{ minWidth: 0 }} value={dataParams.vocabPath}
                  onChange={(e, data) => {
                    setDataParams({ vocabPath: data.value });
                  }} />
-                <Button appearance="secondary" size="large" onClick={async () => {
+                <Button appearance="secondary" onClick={async () => {
                  const ok = await checkDependencies(navigate);
                  if (!ok)
                    return;
-                  const outputPrefix = './finetune/json2binidx_tool/data/' + dataParams.dataPath.split(/[\/\\]/).pop()!.split('.')[0];
+                  const outputPrefix = './finetune/json2binidx_tool/data/' +
+                    dataParams.dataPath.replace(/[\/\\]$/, '').split(/[\/\\]/).pop()!.split('.')[0];
                  ConvertData(commonStore.settings.customPythonPath, dataParams.dataPath, outputPrefix, dataParams.vocabPath).then(async () => {
                    if (!await FileExists(outputPrefix + '_text_document.idx')) {
                      toast(t('Failed to convert data') + ' - ' + await GetPyError(), { type: 'error' });