perf(build): override Tavily's default model name; add a separate script for full build control
Dockerfile
@@ -5,12 +5,10 @@ WORKDIR /app
 COPY package.json pnpm-lock.yaml ./
 RUN npm i -g --force pnpm@9
 RUN pnpm install --frozen-lockfile
-# Remove unused encodings from `js-tiktoken`
-RUN node scripts/cleanup-js-tiktoken.js

 COPY . .
 ENV NODE_OPTIONS="--max_old_space_size=2048"
-RUN pnpm build
+RUN pnpm build:optimize

 FROM node:20-alpine AS runner
package.json
@@ -4,8 +4,10 @@
   "type": "module",
   "scripts": {
     "build": "nuxt build",
+    "build:optimize": "node scripts/build.js",
     "dev": "nuxt dev",
     "generate": "nuxt generate",
+    "generate:optimize": "node scripts/build.js --generate",
     "preview": "nuxt preview",
     "postinstall": "nuxt prepare"
   },
scripts/build.js (new file, 106 lines)
@@ -0,0 +1,106 @@
/**
 * build.js
 * The project uses js-tiktoken to trim text so it does not exceed the LLM's context length.
 * However, js-tiktoken hard-codes every encoding into its bundle (node_modules/js-tiktoken/dist/index.cjs and node_modules/js-tiktoken/dist/index.js),
 * which makes the build output quite large.
 *
 * After analyzing the import dependencies, I found that the
 * `@tavily/core` SDK also uses js-tiktoken, through its `cl100k_base` encoding.
 * So out of the box only `cl100k_base` (for Tavily) and `o200k_base` (for our project) are needed.
 *
 * This script overrides Tavily's default model name so it resolves to `o200k_base` as well,
 * then clears the now-unused js-tiktoken encodings from the build output,
 * making it roughly 2 MB smaller.
 */

import { execSync } from 'child_process';
import fs from 'fs';

// Change Tavily SDK's js-tiktoken to use `o200k_base`
const tavilyFilePaths = ['node_modules/@tavily/core/dist/index.js', 'node_modules/@tavily/core/dist/index.mjs'];
const tiktokenFilePaths = ['node_modules/js-tiktoken/dist/index.cjs', 'node_modules/js-tiktoken/dist/index.js'];

function cleanup() {
  for (const filePath of tavilyFilePaths) {
    if (fs.existsSync(filePath)) {
      // Create a backup
      const backupPath = filePath + '.bak';
      fs.copyFileSync(filePath, backupPath);

      let content = fs.readFileSync(filePath, 'utf-8');
      content = content.replace(
        `var DEFAULT_MODEL_ENCODING = "gpt-3.5-turbo"`,
        'var DEFAULT_MODEL_ENCODING = "gpt-4o"'
      );
      fs.writeFileSync(filePath, content, 'utf-8');
      console.log(`Successfully overrode Tavily SDK's js-tiktoken model name in ${filePath}`);
    }
  }

  for (const filePath of tiktokenFilePaths) {
    if (fs.existsSync(filePath)) {
      // Create a backup
      const backupPath = filePath + '.bak';
      fs.copyFileSync(filePath, backupPath);

      let content = fs.readFileSync(filePath, 'utf-8');

      // Clear all encodings except for o200k_base
      const patterns = [
        ['gpt2_default', 'var gpt2_default = { "explicit_n_vocab": 50257, "pat_str":', 'var gpt2_default = {}'],
        ['p50k_base_default', 'var p50k_base_default = { "explicit_n_vocab":', 'var p50k_base_default = {}'],
        ['p50k_edit_default', 'var p50k_edit_default = { "pat_str":', 'var p50k_edit_default = {}'],
        ['r50k_base_default', 'var r50k_base_default = { "explicit_n_vocab":', 'var r50k_base_default = {}'],
        ['cl100k_base', 'var cl100k_base_default = { "pat_str":', 'var cl100k_base_default = {}'],
      ];

      for (const [name, searchStr, replaceStr] of patterns) {
        const startIdx = content.indexOf(searchStr);
        if (startIdx === -1) continue;

        // Find the end of the line
        const endIdx = content.indexOf('\n', startIdx);
        if (endIdx === -1) continue;

        // Replace the line
        content = content.slice(0, startIdx) + replaceStr + content.slice(endIdx);
      }

      // Write back
      fs.writeFileSync(filePath, content, 'utf-8');
      console.log(`Successfully cleaned up js-tiktoken encodings in ${filePath}`);
    }
  }
}

function restore() {
  for (const filePath of tavilyFilePaths) {
    if (fs.existsSync(`${filePath}.bak`)) {
      console.log(`Restoring Tavily SDK's js-tiktoken encodings in ${filePath}`);
      fs.renameSync(`${filePath}.bak`, filePath);
    }
  }

  for (const filePath of tiktokenFilePaths) {
    if (fs.existsSync(`${filePath}.bak`)) {
      console.log(`Restoring js-tiktoken encodings in ${filePath}`);
      fs.renameSync(`${filePath}.bak`, filePath);
    }
  }
}

function build() {
  try {
    let command = 'build'
    if (process.argv.includes('--generate')) {
      command = 'generate'
    }
    cleanup()
    execSync(`pnpm ${command}`, { stdio: 'inherit' })
  } catch (error) {
    console.error(error)
    // Propagate the failure so a broken `pnpm build`/`pnpm generate` also fails this wrapper (e.g. in docker build)
    process.exitCode = 1
  } finally {
    restore()
  }
}

build()
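Why the model-name override above also lets `cl100k_base` be dropped: js-tiktoken picks an encoding from the model name, and `gpt-4o` resolves to `o200k_base`, while Tavily's previous default `gpt-3.5-turbo` resolved to `cl100k_base`. A minimal sketch of that mapping (not part of this commit; it assumes `getEncodingNameForModel` is exported from the js-tiktoken package root, as in recent versions):

// Sketch only — illustrates the model-to-encoding mapping the override relies on.
import { getEncodingNameForModel } from 'js-tiktoken';

console.log(getEncodingNameForModel('gpt-3.5-turbo')); // "cl100k_base" (Tavily's old default)
console.log(getEncodingNameForModel('gpt-4o'));        // "o200k_base" (after the override; the encoding the project itself uses)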
scripts/cleanup-js-tiktoken.js (deleted)
@@ -1,47 +0,0 @@
/**
 * cleanup-js-tiktoken.js
 * The project uses js-tiktoken to trim text so it does not exceed the LLM's context length.
 * However, js-tiktoken hard-codes every encoding into its bundle (node_modules/js-tiktoken/dist/index.cjs and node_modules/js-tiktoken/dist/index.js),
 * which makes the build output quite large.
 *
 * I've analyzed the import dependencies and found that the
 * `@tavily/core` SDK also uses js-tiktoken. So basically
 * only `cl100k_base` (for Tavily) and `o200k_base` (for our project) are needed.
 *
 * So I wrote this script to clean up the unused js-tiktoken encodings in the build output,
 * making it smaller by about 2 MB.
 */

import fs from 'fs';

const filePaths = ['node_modules/js-tiktoken/dist/index.cjs', 'node_modules/js-tiktoken/dist/index.js'];

for (const filePath of filePaths) {
  if (fs.existsSync(filePath)) {
    let content = fs.readFileSync(filePath, 'utf-8');

    // Keep cl100k_base and o200k_base; clear all other encodings
    const patterns = [
      ['gpt2_default', 'var gpt2_default = { "explicit_n_vocab": 50257, "pat_str":', 'var gpt2_default = {}'],
      ['p50k_base_default', 'var p50k_base_default = { "explicit_n_vocab":', 'var p50k_base_default = {}'],
      ['p50k_edit_default', 'var p50k_edit_default = { "pat_str":', 'var p50k_edit_default = {}'],
      ['r50k_base_default', 'var r50k_base_default = { "explicit_n_vocab":', 'var r50k_base_default = {}'],
    ];

    for (const [name, searchStr, replaceStr] of patterns) {
      const startIdx = content.indexOf(searchStr);
      if (startIdx === -1) continue;

      // Find where the variable definition ends (end of line)
      const endIdx = content.indexOf('\n', startIdx);
      if (endIdx === -1) continue;

      // Replace the entire variable definition
      content = content.slice(0, startIdx) + replaceStr + content.slice(endIdx);
    }

    // Write the file back
    fs.writeFileSync(filePath, content, 'utf-8');
    console.log(`Successfully cleaned up js-tiktoken encodings in ${filePath}`);
  }
}