Commit de90e53f authored by Jarrod

Update README.md, package.json, lib/index.js, lib/parser.js, parser.md files

parent df2a2912
Pipeline #1318 passed with stages in 52 seconds
README.md
-## @nahanil/cedict-js
+## @nahanil/zhdict-lite
Fallback data source for [zhdict.net](https://zhdict.net/)
```bash
-npm install --save @nahanil/cedict-js
+npm install --save @nahanil/zhdict-lite
# OR
-yarn add @nahanil/cedict-js
+yarn add @nahanil/zhdict-lite
```
```js
-const cedict = require('@nahanil/cedict-js')
+const dict = require('@nahanil/zhdict-lite')
```
### Search for a word or character
```js
-let results = await cedict.search('水')
+let results = await dict.search('水')
/*
* Always returns an array of results
[
@@ -37,7 +37,7 @@ let results = await cedict.search('水')
]
*/
-let results = await cedict.search('你好')
+let results = await dict.search('你好')
/*
[
{
@@ -62,7 +62,7 @@ let results = await cedict.search('你好')
### Search for a word or character by radical
```js
-let results = await cedict.searchByRadical('水')
+let results = await dict.searchByRadical('水')
/*
* Always returns an array of results
[
......
lib/index.js
@@ -3,17 +3,16 @@ const Op = require('sequelize').Op
const _ = require('lodash')
/**
-* @zhdict/cedict-js
-* @module @zhdict/cedict-js
+* @nahanil/zhdict-lite
+* @module @nahanil/zhdict-lite
*/
-class Cedict {
+class ZhdictLite {
constructor () {
this.db = DB()
}
/**
* Search for a given word.
-* @module cedict-js
* @function search
* @param {string|object} query - The word as a string, or a Sequelize `where` query.
* @returns {Promise}
@@ -56,4 +55,4 @@ class Cedict {
}
}
-module.exports = new Cedict()
+module.exports = new ZhdictLite()
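For context (an editor's illustration, not part of this commit): a minimal sketch of what a `search` implementation along these lines might look like. The `words` model and its `simplified`/`traditional` column names are assumptions for illustration, not necessarily the module's actual schema.

```js
const { Op } = require('sequelize')

class ZhdictLite {
  constructor (db) {
    // Assumed: `db` is an initialised Sequelize instance with a `words` model.
    this.db = db
  }

  /**
   * Search for a given word (sketch).
   * @param {string|object} query - The word as a string, or a Sequelize `where` query.
   * @returns {Promise} Resolves to an array of plain result objects.
   */
  search (query) {
    // A bare string matches either script; an object is passed through as a `where` clause.
    const where = typeof query === 'string'
      ? { [Op.or]: [{ simplified: query }, { traditional: query }] }
      : query
    return this.db.models.words.findAll({ where, raw: true })
  }
}
```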
lib/parser.js: this diff is collapsed.
package.json
{
-"name": "@zhdict/cedict-js",
+"name": "@zhdict/zhdict-lite",
"version": "0.0.1",
"private": true,
"description": "",
@@ -26,11 +26,6 @@
"type": "git",
"url": "https://git.carrotlabs.net/zhdict/cedict-js"
},
"dependencies": {
"lodash": "^4.17.11",
"sequelize": "^6.0.0",
"sqlite3": "^4.0.4"
},
"devDependencies": {
"@nahanil/bushou": "0.0.3",
"@nahanil/hsk-words": "^0.2.0",
@@ -40,5 +35,10 @@
"jsdoc": "^3.6.2",
"n-readlines": "^1.0.0",
"standard": "^12.0.1"
-}
+},
+"dependencies": {
+"lodash": "^4.17.11",
+"sequelize": "^6.0.0",
+"sqlite3": "^4.0.4"
+}
}
parser.md
@@ -9,7 +9,7 @@
- Unzip it
- Run this:
```bash
-node ./lib/parser.js --input cedict_1_0_ts_utf-8_mdbg.txt --output cedict.sqlite
+node ./lib/parser.js --input ./cedict_ts.u8 --output ./data/cedict.sqlite
```
#### OR
```bash
@@ -18,11 +18,7 @@ npm run build
### Yeah, it's sluggish
So, to avoid waiting around for this all day, set the `--output` directory to some kind of RAM disk.
-Use this for your output directory so the SQLite writes are all done in RAM - should be quicker than hitting your disk. Just make sure that you move the completed database into a 'real' directory after it's been created otherwise it'll disappear at reboot.
-**FYI**
-Using MacBook SSD: 5m47.255s
-Using MacBook RAMDisk: 2m56.968s
+Use this as your output directory so the SQLite writes are all done in RAM - it should be quicker than hitting your disk. Just make sure you move the completed database into a 'real' directory after it's been created, otherwise it'll disappear at reboot (this is basically what the "build" script does).
#### Linux
If you're running some variant of Linux, you probably have access to the [`/dev/shm`](https://www.cyberciti.biz/tips/what-is-devshm-and-its-practical-usage.html) directory. Use it :) (e.g. `--output /dev/shm/cedict.sqlite`, then move the finished database somewhere permanent).
@@ -53,14 +49,14 @@ You're on your own
### Future
#### TODO (Maybe)
-- It might be nice to store word frequency to order results in a potentially more useful manner - See [SUBTLEX-CH](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0010729), [Jun Da's Frequency statistics](http://lingua.mtsu.edu/chinese-computing/statistics/bigram/form.php).
-- The [Unihan Database](http://www.unicode.org/charts/unihan.html) contains some interesting character data (stroke count, radical data, character variations, some characters not in CC-CEDICT, some character parts not in CC-CEDICT) as well that could be brought in.
+- [ ] It might be nice to store word frequency so results can be ordered in a more useful manner (see the sketch below) - see [SUBTLEX-CH](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0010729) and [Jun Da's frequency statistics](http://lingua.mtsu.edu/chinese-computing/statistics/bigram/form.php).
+- [x] The [Unihan Database](http://www.unicode.org/charts/unihan.html) also contains some interesting character data that could be brought in: stroke counts, radical data, character variations, and some characters and character parts that aren't in CC-CEDICT.
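For illustration (not part of this commit): one way the frequency idea could work, assuming a plain word-to-count map derived from a source like SUBTLEX-CH. The `frequencies` values and the `simplified` field below are placeholders, not real data or the module's actual result shape.

```js
// Hypothetical: order search results by corpus frequency, highest first.
const frequencies = { '你好': 1234, '水': 5678 } // placeholder counts

function orderByFrequency (results) {
  // Words without a known frequency sort last instead of skewing the order.
  return [...results].sort(
    (a, b) => (frequencies[b.simplified] || 0) - (frequencies[a.simplified] || 0)
  )
}
```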
#### Improvements
There are a few optimisations that could be made here, though I'm not sure they would be worth the effort.
-- Removing `n-readlines` as a dependency and rather loading the entire file into memory shaved around a minute off the processing time in my (rather un-scientific) tests.
-- Batching the SQLite inserts _may_ help to improve throughput, though if I add back-tracking to merge things like the following 國 example, those batches will add complexity/mess things up.
-- Merge "duplicate" entries like the below:
+- [ ] Removing `n-readlines` as a dependency and instead loading the entire file into memory shaved around a minute off the processing time in my (rather un-scientific) tests.
+- [ ] Batching the SQLite inserts _may_ improve throughput, though if I add back-tracking to merge entries like the 國 example below, the batching will add complexity/mess things up.
+- [x] Merge "duplicate" entries like the ones below (see the sketch after the example):
```
國 国 [Guo2] /surname Guo/
國 国 [guo2] /country/nation/state/national/CL:個|个[ge4]/
```
......
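For illustration (again, an editor's sketch rather than the parser's actual code): merging consecutive CC-CEDICT lines that share a headword, as in the 國 example above. The parsed-entry shape (`traditional`/`simplified`/`pinyin`/`definitions`) is an assumption.

```js
// Merge consecutive entries with the same traditional+simplified headword,
// grouping each reading's definitions under one record.
function mergeDuplicates (entries) {
  const merged = []
  for (const entry of entries) {
    const prev = merged[merged.length - 1]
    if (prev && prev.traditional === entry.traditional && prev.simplified === entry.simplified) {
      // Same headword as the previous line: fold this reading in.
      prev.readings.push({ pinyin: entry.pinyin, definitions: entry.definitions })
    } else {
      merged.push({
        traditional: entry.traditional,
        simplified: entry.simplified,
        readings: [{ pinyin: entry.pinyin, definitions: entry.definitions }]
      })
    }
  }
  return merged
}

// The two 國 lines above would collapse into one record with two readings:
// mergeDuplicates([
//   { traditional: '國', simplified: '国', pinyin: 'Guo2', definitions: ['surname Guo'] },
//   { traditional: '國', simplified: '国', pinyin: 'guo2', definitions: ['country', 'nation'] }
// ])
```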