perf(build): override Tavily's default model name; Add separate script for full building control

This commit is contained in:
AnotiaWang
2025-02-13 14:48:06 +08:00
parent 1d4d1e8113
commit 2b4ace5c7d
4 changed files with 109 additions and 50 deletions

106
scripts/build.js Normal file
View File

@ -0,0 +1,106 @@
/**
* build.js
* The project uses js-tiktoken to cut down text to avoid exceeding LLM's context length.
* However, js-tiktoken has all encodings hard-coded into its code (node_modules/js-tiktoken/dist/index.cjs and node_modules/js-tiktoken/dist/index.js),
* which makes the build output quite large.
*
* I've analyzed the import dependencies and found that the
* `@tavily/core` SDK also uses js-tiktoken's `cl100k_base` encoding.
* So basically only `cl100k_base` (for tavily) and `o200k_base` (for our project) are needed.
*
* So I wrote this script to override the default model name in `@tavily/core` to `gpt-4o`
* (so that it uses `o200k_base` as well) and to stub out the unused js-tiktoken encodings
* before building, making the build output smaller by about 2 MB.
*/
import { execSync } from 'child_process';
import fs from 'fs';
// Tavily SDK dist files in which cleanup() rewrites DEFAULT_MODEL_ENCODING
// from "gpt-3.5-turbo" to "gpt-4o" (paths are relative to the working directory).
const tavilyFilePaths = ['node_modules/@tavily/core/dist/index.js', 'node_modules/@tavily/core/dist/index.mjs'];
// js-tiktoken dist files in which cleanup() stubs out the gpt2, p50k_base,
// p50k_edit, r50k_base and cl100k_base encoding tables.
const tiktokenFilePaths = ['node_modules/js-tiktoken/dist/index.cjs', 'node_modules/js-tiktoken/dist/index.js'];
/**
 * Moves the pristine copy of `filePath` aside to `<filePath>.bak` and returns
 * its content.
 *
 * If a backup already exists (e.g. a previous run was interrupted before the
 * originals were put back), it is treated as the pristine copy and is never
 * overwritten; whatever currently sits at `filePath` is considered a stale,
 * already-patched leftover.
 *
 * @param {string} filePath - File that is about to be patched.
 * @returns {string | null} The pristine content, or `null` when neither the
 *   file nor a backup of it exists.
 */
function stashOriginal(filePath) {
  const backupPath = `${filePath}.bak`;
  if (!fs.existsSync(backupPath)) {
    if (!fs.existsSync(filePath)) return null;
    // Rename instead of copy: the original file is kept untouched as the backup.
    fs.renameSync(filePath, backupPath);
  }
  return fs.readFileSync(backupPath, 'utf-8');
}

/**
 * Writes `content` to `filePath` as a brand-new file.
 *
 * Any existing entry is unlinked first so that the write never goes through an
 * existing inode. NOTE(review): package managers such as pnpm can hard-link
 * files in node_modules to a shared store; writing in place would then modify
 * the store copy too.
 *
 * @param {string} filePath - Destination path.
 * @param {string} content - Patched file content.
 */
function writePatched(filePath, content) {
  if (fs.existsSync(filePath)) fs.unlinkSync(filePath);
  fs.writeFileSync(filePath, content, 'utf-8');
}

/**
 * Replaces the unused js-tiktoken encoding tables in `content` with empty
 * object stubs, leaving everything else (including `o200k_base`) untouched.
 *
 * Each replacement runs from the start of the matched declaration to the end
 * of that line, so this relies on every table being emitted on a single line.
 *
 * @param {string} content - Source text of a js-tiktoken dist file.
 * @param {string} filePath - Only used in warning messages.
 * @returns {string} The content with the matched tables stubbed out.
 */
function stubUnusedEncodings(content, filePath) {
  // [encoding name, start of the declaration to look for, replacement line]
  const patterns = [
    ['gpt2_default', 'var gpt2_default = { "explicit_n_vocab": 50257, "pat_str":', 'var gpt2_default = {}'],
    ['p50k_base_default', 'var p50k_base_default = { "explicit_n_vocab":', 'var p50k_base_default = {}'],
    ['p50k_edit_default', 'var p50k_edit_default = { "pat_str":', 'var p50k_edit_default = {}'],
    ['r50k_base_default', 'var r50k_base_default = { "explicit_n_vocab":', 'var r50k_base_default = {}'],
    ['cl100k_base', 'var cl100k_base_default = { "pat_str":', 'var cl100k_base_default = {}'],
  ];
  let result = content;
  for (const [name, searchStr, replaceStr] of patterns) {
    const startIdx = result.indexOf(searchStr);
    if (startIdx === -1) {
      console.warn(`Encoding ${name} not found in ${filePath}; leaving it untouched`);
      continue;
    }
    // A declaration on the very last line may not be followed by a newline.
    const newlineIdx = result.indexOf('\n', startIdx);
    const endIdx = newlineIdx === -1 ? result.length : newlineIdx;
    result = `${result.slice(0, startIdx)}${replaceStr}${result.slice(endIdx)}`;
  }
  return result;
}

/**
 * Patches the installed dependencies ahead of a build:
 *  - in every existing file of `tavilyFilePaths`, the default model name
 *    `"gpt-3.5-turbo"` is replaced with `"gpt-4o"`;
 *  - in every existing file of `tiktokenFilePaths`, the unused encoding
 *    tables are replaced with empty objects.
 *
 * The pristine version of each patched file is kept next to it as
 * `<file>.bak`. Running this function again without restoring the backups in
 * between is safe: the backups are reused, never overwritten.
 */
function cleanup() {
  const searchStr = 'var DEFAULT_MODEL_ENCODING = "gpt-3.5-turbo"';
  for (const filePath of tavilyFilePaths) {
    const original = stashOriginal(filePath);
    if (original === null) continue;
    const patched = original.replace(searchStr, 'var DEFAULT_MODEL_ENCODING = "gpt-4o"');
    writePatched(filePath, patched);
    if (patched === original) {
      // Do not claim success when the SDK no longer contains the expected line.
      console.warn(`Tavily SDK's default model name was not found in ${filePath}; file left unchanged`);
    } else {
      console.log(`Successfully overrided Tavily SDK's js-tiktoken model name in ${filePath}`);
    }
  }
  for (const filePath of tiktokenFilePaths) {
    const original = stashOriginal(filePath);
    if (original === null) continue;
    // Clear all encodings except for o200k_base
    writePatched(filePath, stubUnusedEncodings(original, filePath));
    console.log(`Successfully cleaned up js-tiktoken encodings in ${filePath}`);
  }
}
/**
 * Puts the original dependency files back in place.
 *
 * For every path in `tavilyFilePaths` and then in `tiktokenFilePaths`, a
 * sibling `<path>.bak` file (if there is one) is renamed over the path,
 * which also removes the backup. Paths without a backup are skipped.
 */
function restore() {
  // Renames `<target>.bak` back to `target`, announcing it with `message`.
  const putBack = (target, message) => {
    const backup = `${target}.bak`;
    if (!fs.existsSync(backup)) return;
    console.log(message);
    fs.renameSync(backup, target);
  };

  tavilyFilePaths.forEach((filePath) => {
    putBack(filePath, `Restoring Tavily SDK's js-tiktoken encodings in ${filePath}`);
  });
  tiktokenFilePaths.forEach((filePath) => {
    putBack(filePath, `Restoring js-tiktoken encodings in ${filePath}`);
  });
}
/**
 * Patches the dependencies, runs `pnpm build` (or `pnpm generate` when
 * `--generate` is present on the command line) with inherited stdio, and
 * always restores the dependencies afterwards.
 *
 * Any error raised while patching or building is logged and reflected in the
 * process exit code, so that callers (shell scripts, CI) can detect the
 * failure instead of seeing a successful exit.
 */
function build() {
  const command = process.argv.includes('--generate') ? 'generate' : 'build';
  try {
    cleanup();
    execSync(`pnpm ${command}`, { stdio: 'inherit' });
  } catch (error) {
    console.error(error);
    // execSync errors carry the child's exit code in `status`; fall back to 1
    // for everything else (e.g. a failure while patching the files).
    const status = error && Number.isInteger(error.status) ? error.status : 0;
    process.exitCode = status !== 0 ? status : 1;
  } finally {
    restore();
  }
}
// Script entry point: start the build as soon as this file is executed.
build()