node+sqlite 全文搜索初步尝鲜

李恒道 · 发表于 2022-10-3 17:20:53

最近在做一个比较简单的小工具
需要根据关键字进行搜索
一开始没什么思路
所以就问脚本猫王一之了
一开始想尝试使用es来着
但是研究一下感觉较为笨重
npm库中的full text search也没有翻到比较好的库
https://www.npmjs.com/search?q=full-text%20search
lunr之流看文档是不太符合大量写入写出的...
而且看前人的反馈，中文支持也不是特别好
更倾向于浏览器端的固定读入读出之类的搜索
选择来选择去
最后选择了sqlite 的全文搜索
但是观察
https://www.zhihu.com/question/37114296
也比较微妙...中文需要空格，并且没法模糊匹配
但是回答里茶树大神安利了一个自己的库
所以决定动手尝试一下

开始尝试

先拉一份他的库到本地
看看例子
https://github.com/wangfenjin/simple/tree/master/examples/node
这里我按他的配置失败了
于是从头理了一下

# update dependency versions
ncu -u
# install dependencies
npm install
# download the prebuilt lib from GitHub (the "download" script runs `just install`)
npm run download
# run the example using the downloaded lib
npm run p
# run the example with explicit ext_path and dict_path
npm run p -- --ext_path=/path/to/libsimple/ --dict_path=/path/to/dict/
# remove the build folder
npm run clean

ncu我们不用管
npm install安装依赖
npm run download
运行download脚本
"download": "just install",
是调用just install
因为实在不太熟悉这个库...
看样子应该是调用了just-task.js的文件内容
代码是

/**
 * `install` task: downloads the prebuilt libsimple archive and extracts it
 * into the `lib` directory next to this file.
 *
 * The returned promise resolves (with no value) once the archive has been
 * downloaded and extracted, and rejects with the underlying download error
 * when it fails, so the task runner can report the real cause.
 */
task('install', () => {
  const localPath = path.join(__dirname, 'lib')
  let platform = process.env.npm_config_target_platform || process.platform
  logger.info(`[install] Target platform: ${platform}`)
  if (platform === 'darwin') {
    platform = 'osx'
  } else if (platform === 'win32') {
    platform = 'windows'
  }
  const arch = process.env.npm_config_target_arch || process.arch
  logger.info(`[install] Target arch: ${arch}`)
  const downloadUrl = `https://github.com/wangfenjin/simple/releases/download/v0.2.0-alpha/libsimple-linux-ubuntu-18.04.zip`
  // The archive is pinned to the Linux build whatever the target is. Say so
  // up front instead of letting the extension fail to load later on.
  // TODO(review): pick the asset matching platform/arch once the release asset names are confirmed.
  if (platform !== 'linux') {
    logger.warn(`[install] ${downloadUrl} is a Linux build but the target is ${platform}/${arch}; replace it with the matching archive from https://github.com/wangfenjin/simple/releases`)
  }
  logger.info(`[install] Download prebuilt binaries from ${downloadUrl}`)
  // download() already returns a promise, so chain on it directly rather than
  // wrapping it in a new Promise.
  return download(downloadUrl, localPath, {
    extract: true, strip: 1
  }).then(() => {}, err => {
    logger.warn(`[install] Failed to download package from: ${downloadUrl}, err: ${err}`)
    // Re-throw so the rejection carries the original error instead of undefined.
    throw err
  })
})

前边大概配置了一下环境
然后设置了一个url
最后再调用download进行下载
但是downloadUrl是固定的
会下载libsimple-linux-ubuntu-18.04.zip
因为我是Windows系统所以导致出现问题
找到错误原因了！
去https://github.com/wangfenjin/simple/releases
找一下对应的编译文件替换下地址，然后删除依赖啥的重新跑一下
然后运行npm run p正确跑通
demo还是非常干净的，给大佬打call！
https://github.com/wangfenjin/simple/blob/master/examples/node/node-sqlite3.js
我大概注释一下

const path = require("path");
// verbose() puts sqlite3 into its stack-trace (debug) mode.
const sqlite3 = require('sqlite3').verbose();
// Create the SQLite database in memory.
const db = new sqlite3.Database(':memory:');

const process = require( 'process' );
/**
 * Read a command-line option from process.argv.
 *
 * @param {string} key - Option name without the leading dashes.
 * @returns {true|string|null} `true` when the bare flag `--key` is present,
 *   the text after `--key=` when a value was supplied, otherwise `null`.
 */
const argv = key => {
    const flag = `--${ key }`;
    // A bare flag wins over any `--key=value` form.
    if ( process.argv.includes( flag ) ) return true;
    const prefix = `${ flag }=`;
    const match = process.argv.find( element => element.startsWith( prefix ) );
    // No `--key=...` argument at all.
    if ( match === undefined ) return null;
    return match.slice( prefix.length );
}
// Serialize the database commands so they run in the order they are queued,
// which avoids ordering problems between the asynchronous calls.
db.serialize(function() {
    // Run `sql` and print "<id>: <info>" for every returned row. A failed
    // query is reported instead of dereferencing an undefined row.
    const printMatches = (sql) => {
        db.each(sql, (err, row) => {
            if (err) {
                console.error("query failed: " + err.message);
                return;
            }
            console.log(row.id + ": " + row.info);
        });
    };

    // argv() returns `true` for a bare `--key` flag; only a real string value
    // may override a path, otherwise the default is kept.
    const pathOption = (key, fallback) => {
        const value = argv(key);
        return typeof value === 'string' && value ? value : fallback;
    };

    // Extension directory: ./lib by default, overridable with --ext_path=<dir>.
    const ext_path = pathOption('ext_path', path.resolve("./lib/"));
    // Dictionary directory: <ext_path>/dict by default, overridable with --dict_path=<dir>.
    const dict_path = pathOption('dict_path', path.join(ext_path, "dict"));
    console.log("extension path: " + ext_path + ", dict path: " + dict_path);

    // load extension
    // The extension file name differs by platform.
    const platform = process.env.npm_config_target_platform || process.platform;
    if (platform === 'win32') {
      db.loadExtension(path.join(ext_path, "simple"));
    } else {
      db.loadExtension(path.join(ext_path, "libsimple"));
    }

    // set the jieba dict file path
    db.run("select jieba_dict(?)", dict_path);

    // create table
    // Creates an fts5 full-text-search virtual table named t1 with a single
    // column x, using `simple` as the tokenizer.
    db.run("CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'simple')");

    // insert some data
    // Inserts sample rows into column x of t1.
    db.run("insert into t1(x) values ('周杰伦 Jay Chou:我已分不清，你是友情还是错过的爱情'), ('周杰伦 Jay Chou:最美的不是下雨天，是曾与你躲过雨的屋檐'), ('I love China! 我爱中国！我是中华人民共和国公民！'), ('@English &special _characters.\"''bacon-&and''-eggs%')");

    // Query anatomy:
    //   where x match simple_query('zjl')  - match column x against the query built by simple_query()
    //   rowid as id                        - expose rowid under the name id
    //   simple_highlight(t1, 0, '[', ']')  - wrap matched words in [ ], exposed as info
    //   from t1                            - search table t1
    printMatches("select rowid as id, simple_highlight(t1, 0, '[', ']') as info from t1 where x match simple_query('zjl')");
    // will match 中国 and 中华人民共和国
    printMatches("select rowid as id, simple_highlight(t1, 0, '[', ']') as info from t1 where x match simple_query('中国')");
    // will match 中国 but not 中华人民共和国
    printMatches("select rowid as id, simple_highlight(t1, 0, '[', ']') as info from t1 where x match jieba_query('中国')");
});
// Close the database.
db.close();

一点小补充

each函数查询多条数据
对每个检索的行都调用一次回调
执行完后如果存在complete函数则回调complete函数
不存在就不调了

功能

这个封装的函数还是蛮多的
我直接抄github了

simple tokenizer 支持中文和拼音的分词，并且可通过开关控制是否需要支持拼音
simple_query() 函数实现自动组装 match query 的功能，用户不用学习 fts5 query 的语法
simple_highlight() 实现连续高亮 match 的词汇，与 sqlite 自带的 highlight 类似，但是 simple_highlight 实现了连续 match 的词汇分到同一组的逻辑，理论上用户更需要这样
simple_highlight_pos() 实现返回 match 的词汇位置，用户可以自行决定怎么使用
simple_snippet() 实现截取 match 片段的功能，与 sqlite 自带的 snippet 功能类似，同样是增强连续 match 的词汇分到同一组的逻辑
jieba_query() 实现jieba分词的效果，在索引不变的情况下，可以实现更精准的匹配。可以通过 -DSIMPLE_WITH_JIEBA=OFF 关掉结巴分词的功能 #35
jieba_dict() 指定 dict 的目录，只需要调用一次，需要在调用 jieba_query() 之前指定。

总结

尝试了一下这个库还是蛮好用的...
比较轻量
匹配的文本还是挺符合需求的
大佬牛逼！(破音

懒男孩 · 发表于 2022-10-3 17:43:01

脚本猫王一之

李恒道 · 发表于 2022-10-3 18:02:12

懒男孩发表于 2022-10-3 17:43
脚本猫王一之

猫王！

王一之 · 发表于 2022-10-3 21:53:12

为什么不用mysql？这是啥需求