diff --git a/Dockerfile b/Dockerfile index 322847f..b42126d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,12 +5,10 @@ WORKDIR /app COPY package.json pnpm-lock.yaml ./ RUN npm i -g --force pnpm@9 RUN pnpm install --frozen-lockfile -# Remove unused encodings from `js-tiktoken` -RUN node scripts/cleanup-js-tiktoken.js COPY . . ENV NODE_OPTIONS="--max_old_space_size=2048" -RUN pnpm build +RUN pnpm build:optimize FROM node:20-alpine AS runner diff --git a/package.json b/package.json index 3c258cc..0b9985e 100644 --- a/package.json +++ b/package.json @@ -4,8 +4,10 @@ "type": "module", "scripts": { "build": "nuxt build", + "build:optimize": "node scripts/build.js", "dev": "nuxt dev", "generate": "nuxt generate", + "generate:optimize": "node scripts/build.js --generate", "preview": "nuxt preview", "postinstall": "nuxt prepare" }, diff --git a/scripts/build.js b/scripts/build.js new file mode 100644 index 0000000..bd78183 --- /dev/null +++ b/scripts/build.js @@ -0,0 +1,106 @@ +/** + * build.js + * The project uses js-tiktoken to cut down text to avoid exceeding LLM's context length. + * However, js-tiktoken has all encodings hard-coded into its code (node_modules/js-tiktoken/dist/index.cjs and node_modules/js-tiktoken/dist/index.js), + * which makes the build output quite large. + * + * I've analyzed the import dependencies and found that the + * `@tavily/core` SDK also uses js-tiktoken's `cl100k_base` encoding. + * Since this script overrides Tavily's default model to one tokenized with `o200k_base`, only the `o200k_base` encoding is actually needed. + * + * So I wrote this script to override the encodings in `@tavily/core` to `o200k_base` + * and clean up unused js-tiktoken encodings in the build output, + * making the build output smaller by about 2 MB. 
+*/ + +import { execSync } from 'child_process'; +import fs from 'fs'; + +// Change Tavily SDK's js-tiktoken to use `o200k_base` +const tavilyFilePaths = ['node_modules/@tavily/core/dist/index.js', 'node_modules/@tavily/core/dist/index.mjs']; +const tiktokenFilePaths = ['node_modules/js-tiktoken/dist/index.cjs', 'node_modules/js-tiktoken/dist/index.js']; + +function cleanup() { + for (const filePath of tavilyFilePaths) { + if (fs.existsSync(filePath)) { + // Create a backup + const backupPath = filePath + '.bak'; + fs.copyFileSync(filePath, backupPath); + + let content = fs.readFileSync(filePath, 'utf-8'); + content = content.replace( + `var DEFAULT_MODEL_ENCODING = "gpt-3.5-turbo"`, + 'var DEFAULT_MODEL_ENCODING = "gpt-4o"' + ); + fs.writeFileSync(filePath, content, 'utf-8'); + console.log(`Successfully overrode Tavily SDK's js-tiktoken model name in ${filePath}`); + } + } + for (const filePath of tiktokenFilePaths) { + if (fs.existsSync(filePath)) { + // Create a backup + const backupPath = filePath + '.bak'; + fs.copyFileSync(filePath, backupPath); + + let content = fs.readFileSync(filePath, 'utf-8'); + + // Clear all encodings except for o200k_base + const patterns = [ + ['gpt2_default', 'var gpt2_default = { "explicit_n_vocab": 50257, "pat_str":', 'var gpt2_default = {}'], + ['p50k_base_default', 'var p50k_base_default = { "explicit_n_vocab":', 'var p50k_base_default = {}'], + ['p50k_edit_default', 'var p50k_edit_default = { "pat_str":', 'var p50k_edit_default = {}'], + ['r50k_base_default', 'var r50k_base_default = { "explicit_n_vocab":', 'var r50k_base_default = {}'], + ['cl100k_base', 'var cl100k_base_default = { "pat_str":', 'var cl100k_base_default = {}'], + ]; + + for (const [name, searchStr, replaceStr] of patterns) { + const startIdx = content.indexOf(searchStr); + if (startIdx === -1) continue; + + // Find the end of line + const endIdx = content.indexOf('\n', startIdx); + if (endIdx === -1) continue; + + // Replace the line + content = 
content.slice(0, startIdx) + replaceStr + content.slice(endIdx); + } + + // Write back + fs.writeFileSync(filePath, content, 'utf-8'); + console.log(`Successfully cleaned up js-tiktoken encodings in ${filePath}`); + } + } +} + +function restore() { + for (const filePath of tavilyFilePaths) { + if (fs.existsSync(`${filePath}.bak`)) { + console.log(`Restoring Tavily SDK's js-tiktoken encodings in ${filePath}`); + fs.renameSync(`${filePath}.bak`, filePath); + } + } + + for (const filePath of tiktokenFilePaths) { + if (fs.existsSync(`${filePath}.bak`)) { + console.log(`Restoring js-tiktoken encodings in ${filePath}`); + fs.renameSync(`${filePath}.bak`, filePath); + } + } +} + +function build() { + try { + let command = 'build' + if (process.argv.includes('--generate')) { + command = 'generate' + } + cleanup() + execSync(`pnpm ${command}`, { stdio: 'inherit' }) + } catch (error) { + console.error(error); process.exitCode = 1 + } finally { + restore() + } +} + +build() diff --git a/scripts/cleanup-js-tiktoken.js b/scripts/cleanup-js-tiktoken.js deleted file mode 100644 index 50b5189..0000000 --- a/scripts/cleanup-js-tiktoken.js +++ /dev/null @@ -1,47 +0,0 @@ -/** - * cleanup-js-tiktoken.js - * The project uses js-tiktoken to cut down text to avoid exceeding LLM's context length. - * However, js-tiktoken has all encodings hard-coded into its code (node_modules/js-tiktoken/dist/index.cjs and node_modules/js-tiktoken/dist/index.js), - * which makes the build output quite large. - * - * I've analyzed the import dependencies and found that the - * `@tavily/core` SDK also uses js-tiktoken. So basically - * only `cl100k_base` (for tavily) and `o200k_base` (for our project) are needed. - * - * So I wrote this script to clean up unused js-tiktoken encodings in the build output, - * making the build output smaller by about 2 MB. 
- */ - -import fs from 'fs'; - -const filePaths = ['node_modules/js-tiktoken/dist/index.cjs', 'node_modules/js-tiktoken/dist/index.js']; - -for (const filePath of filePaths) { - if (fs.existsSync(filePath)) { - let content = fs.readFileSync(filePath, 'utf-8'); - - // 保留 cl100k_base 和 o200k_base,清空其他编码器 - const patterns = [ - ['gpt2_default', 'var gpt2_default = { "explicit_n_vocab": 50257, "pat_str":', 'var gpt2_default = {}'], - ['p50k_base_default', 'var p50k_base_default = { "explicit_n_vocab":', 'var p50k_base_default = {}'], - ['p50k_edit_default', 'var p50k_edit_default = { "pat_str":', 'var p50k_edit_default = {}'], - ['r50k_base_default', 'var r50k_base_default = { "explicit_n_vocab":', 'var r50k_base_default = {}'], - ]; - - for (const [name, searchStr, replaceStr] of patterns) { - const startIdx = content.indexOf(searchStr); - if (startIdx === -1) continue; - - // 找到变量定义的结束位置 - const endIdx = content.indexOf('\n', startIdx); - if (endIdx === -1) continue; - - // 替换整个变量定义 - content = content.slice(0, startIdx) + replaceStr + content.slice(endIdx); - } - - // 写回文件 - fs.writeFileSync(filePath, content, 'utf-8'); - console.log(`Successfully cleaned up js-tiktoken encodings in ${filePath}`); - } -} \ No newline at end of file