Fix JSONL generation when a directory of .txt files is used as training data

This commit is contained in:
josc146 2023-07-09 11:31:07 +08:00
parent 07d89e3eeb
commit 5b1a9448e6

View File

@@ -1,6 +1,7 @@
package backend_golang package backend_golang
import ( import (
"encoding/json"
"errors" "errors"
"os" "os"
"os/exec" "os/exec"
@@ -59,30 +60,17 @@ func (a *App) ConvertData(python string, input string, outputPrefix string, voca
if file.IsDir() || !strings.HasSuffix(file.Name(), ".txt") { if file.IsDir() || !strings.HasSuffix(file.Name(), ".txt") {
continue continue
} }
txtFile, err := os.Open(input + "/" + file.Name()) textContent, err := os.ReadFile(input + "/" + file.Name())
if err != nil { if err != nil {
return "", err return "", err
} }
defer txtFile.Close() textJson, err := json.Marshal(map[string]string{"text": string(textContent)})
jsonlFile.WriteString("{\"text\": \"") if err != nil {
buf := make([]byte, 1024) return "", err
for { }
n, err := txtFile.Read(buf) if _, err := jsonlFile.WriteString(string(textJson) + "\n"); err != nil {
if err != nil { return "", err
break
}
// regex replace \r\n \n \r with \\n
jsonlFile.WriteString(
strings.ReplaceAll(
strings.ReplaceAll(
strings.ReplaceAll(
strings.ReplaceAll(string(buf[:n]),
"\r\n", "\\n"),
"\n", "\\n"),
"\r", "\\n"),
"\n\n", "\\n"))
} }
jsonlFile.WriteString("\"}\n")
} }
input = outputPrefix + ".jsonl" input = outputPrefix + ".jsonl"
} else if err != nil { } else if err != nil {