style: prettier format
This commit is contained in:
@ -1,60 +1,60 @@
|
||||
interface TextSplitterParams {
|
||||
chunkSize: number;
|
||||
chunkSize: number
|
||||
|
||||
chunkOverlap: number;
|
||||
chunkOverlap: number
|
||||
}
|
||||
|
||||
abstract class TextSplitter implements TextSplitterParams {
|
||||
chunkSize = 1000;
|
||||
chunkOverlap = 200;
|
||||
chunkSize = 1000
|
||||
chunkOverlap = 200
|
||||
|
||||
constructor(fields?: Partial<TextSplitterParams>) {
|
||||
this.chunkSize = fields?.chunkSize ?? this.chunkSize;
|
||||
this.chunkOverlap = fields?.chunkOverlap ?? this.chunkOverlap;
|
||||
this.chunkSize = fields?.chunkSize ?? this.chunkSize
|
||||
this.chunkOverlap = fields?.chunkOverlap ?? this.chunkOverlap
|
||||
if (this.chunkOverlap >= this.chunkSize) {
|
||||
throw new Error('Cannot have chunkOverlap >= chunkSize');
|
||||
throw new Error('Cannot have chunkOverlap >= chunkSize')
|
||||
}
|
||||
}
|
||||
|
||||
abstract splitText(text: string): string[];
|
||||
abstract splitText(text: string): string[]
|
||||
|
||||
createDocuments(texts: string[]): string[] {
|
||||
const documents: string[] = [];
|
||||
const documents: string[] = []
|
||||
for (let i = 0; i < texts.length; i += 1) {
|
||||
const text = texts[i];
|
||||
const text = texts[i]
|
||||
for (const chunk of this.splitText(text!)) {
|
||||
documents.push(chunk);
|
||||
documents.push(chunk)
|
||||
}
|
||||
}
|
||||
return documents;
|
||||
return documents
|
||||
}
|
||||
|
||||
splitDocuments(documents: string[]): string[] {
|
||||
return this.createDocuments(documents);
|
||||
return this.createDocuments(documents)
|
||||
}
|
||||
|
||||
private joinDocs(docs: string[], separator: string): string | null {
|
||||
const text = docs.join(separator).trim();
|
||||
return text === '' ? null : text;
|
||||
const text = docs.join(separator).trim()
|
||||
return text === '' ? null : text
|
||||
}
|
||||
|
||||
mergeSplits(splits: string[], separator: string): string[] {
|
||||
const docs: string[] = [];
|
||||
const currentDoc: string[] = [];
|
||||
let total = 0;
|
||||
const docs: string[] = []
|
||||
const currentDoc: string[] = []
|
||||
let total = 0
|
||||
for (const d of splits) {
|
||||
const _len = d.length;
|
||||
const _len = d.length
|
||||
if (total + _len >= this.chunkSize) {
|
||||
if (total > this.chunkSize) {
|
||||
console.warn(
|
||||
`Created a chunk of size ${total}, +
|
||||
which is longer than the specified ${this.chunkSize}`,
|
||||
);
|
||||
)
|
||||
}
|
||||
if (currentDoc.length > 0) {
|
||||
const doc = this.joinDocs(currentDoc, separator);
|
||||
const doc = this.joinDocs(currentDoc, separator)
|
||||
if (doc !== null) {
|
||||
docs.push(doc);
|
||||
docs.push(doc)
|
||||
}
|
||||
// Keep on popping if:
|
||||
// - we have a larger chunk than in the chunk overlap
|
||||
@ -63,81 +63,81 @@ which is longer than the specified ${this.chunkSize}`,
|
||||
total > this.chunkOverlap ||
|
||||
(total + _len > this.chunkSize && total > 0)
|
||||
) {
|
||||
total -= currentDoc[0]!.length;
|
||||
currentDoc.shift();
|
||||
total -= currentDoc[0]!.length
|
||||
currentDoc.shift()
|
||||
}
|
||||
}
|
||||
}
|
||||
currentDoc.push(d);
|
||||
total += _len;
|
||||
currentDoc.push(d)
|
||||
total += _len
|
||||
}
|
||||
const doc = this.joinDocs(currentDoc, separator);
|
||||
const doc = this.joinDocs(currentDoc, separator)
|
||||
if (doc !== null) {
|
||||
docs.push(doc);
|
||||
docs.push(doc)
|
||||
}
|
||||
return docs;
|
||||
return docs
|
||||
}
|
||||
}
|
||||
|
||||
export interface RecursiveCharacterTextSplitterParams
|
||||
extends TextSplitterParams {
|
||||
separators: string[];
|
||||
separators: string[]
|
||||
}
|
||||
|
||||
export class RecursiveCharacterTextSplitter
|
||||
extends TextSplitter
|
||||
implements RecursiveCharacterTextSplitterParams
|
||||
{
|
||||
separators: string[] = ['\n\n', '\n', '.', ',', '>', '<', ' ', ''];
|
||||
separators: string[] = ['\n\n', '\n', '.', ',', '>', '<', ' ', '']
|
||||
|
||||
constructor(fields?: Partial<RecursiveCharacterTextSplitterParams>) {
|
||||
super(fields);
|
||||
this.separators = fields?.separators ?? this.separators;
|
||||
super(fields)
|
||||
this.separators = fields?.separators ?? this.separators
|
||||
}
|
||||
|
||||
splitText(text: string): string[] {
|
||||
const finalChunks: string[] = [];
|
||||
const finalChunks: string[] = []
|
||||
|
||||
// Get appropriate separator to use
|
||||
let separator: string = this.separators[this.separators.length - 1]!;
|
||||
let separator: string = this.separators[this.separators.length - 1]!
|
||||
for (const s of this.separators) {
|
||||
if (s === '') {
|
||||
separator = s;
|
||||
break;
|
||||
separator = s
|
||||
break
|
||||
}
|
||||
if (text.includes(s)) {
|
||||
separator = s;
|
||||
break;
|
||||
separator = s
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Now that we have the separator, split the text
|
||||
let splits: string[];
|
||||
let splits: string[]
|
||||
if (separator) {
|
||||
splits = text.split(separator);
|
||||
splits = text.split(separator)
|
||||
} else {
|
||||
splits = text.split('');
|
||||
splits = text.split('')
|
||||
}
|
||||
|
||||
// Now go merging things, recursively splitting longer texts.
|
||||
let goodSplits: string[] = [];
|
||||
let goodSplits: string[] = []
|
||||
for (const s of splits) {
|
||||
if (s.length < this.chunkSize) {
|
||||
goodSplits.push(s);
|
||||
goodSplits.push(s)
|
||||
} else {
|
||||
if (goodSplits.length) {
|
||||
const mergedText = this.mergeSplits(goodSplits, separator);
|
||||
finalChunks.push(...mergedText);
|
||||
goodSplits = [];
|
||||
const mergedText = this.mergeSplits(goodSplits, separator)
|
||||
finalChunks.push(...mergedText)
|
||||
goodSplits = []
|
||||
}
|
||||
const otherInfo = this.splitText(s);
|
||||
finalChunks.push(...otherInfo);
|
||||
const otherInfo = this.splitText(s)
|
||||
finalChunks.push(...otherInfo)
|
||||
}
|
||||
}
|
||||
if (goodSplits.length) {
|
||||
const mergedText = this.mergeSplits(goodSplits, separator);
|
||||
finalChunks.push(...mergedText);
|
||||
const mergedText = this.mergeSplits(goodSplits, separator)
|
||||
finalChunks.push(...mergedText)
|
||||
}
|
||||
return finalChunks;
|
||||
return finalChunks
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user