@@ -5,3 +5,5 @@ docs
 coverage
 package-lock.json
 yarn.lock
+.tmp
+nohup.out
@@ -86,3 +86,7 @@ let results = await cedict.searchByRadical('水')
 ]
 */
 ```
+
+### See also
+- [parser.md](./parser.md)
+- [CC-CEDICT download](https://www.mdbg.net/chinese/dictionary?page=cedict)
\ No newline at end of file
module.exports = (options) => {
  options = options || {}
  const path = require('path')
  const Sequelize = require('sequelize')
  options.dbpath = options.dbpath || path.join(__dirname, '../../data/cedict.sqlite')
  const sequelize = new Sequelize('cedict', null, null, {
    dialect: 'sqlite',
    logging: false,
    storage: options.dbpath
  })
  return {
......
@@ -16,6 +16,9 @@ module.exports = (sequelize, Sequelize) => {
     english: {
       type: Sequelize.STRING
     },
+    hskLevel: {
+      type: Sequelize.INTEGER
+    },
     radicalSimplified: {
       type: Sequelize.STRING
     },
......
const fs = require('fs')
const _ = require('lodash')
const commandLineArgs = require('command-line-args')
const LineByLine = require('n-readlines')
const crypto = require('crypto')
const unihan = require('cjk-unihan')
const hsk = require('@nahanil/hsk-words')
const bushou = require('@nahanil/bushou')
const keys = ['traditional', 'simplified', 'pinyin', 'english']
const regex = /(.*)\s+(.*)\s+\[([^]+)\]\s+[/](.*)[/]/
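// A CC-CEDICT line looks like:
//   國 国 [Guo2] /surname Guo/
// ie. traditional, simplified, [pinyin], then one or more /-delimited senses -
// the four capture groups above map onto `keys` in that order.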
const getUnihan = (key, field) => {
  return new Promise((resolve, reject) => {
    unihan.get(key, field, (err, res) => {
      if (err) {
        return reject(err)
      }
      resolve(res)
    })
  })
}
// Handle command line options
const optionDefinitions = [
  { name: 'verbose', alias: 'v', type: Boolean, defaultValue: false },
  { name: 'input', type: String, defaultValue: './cedict_ts.u8' },
  { name: 'output', type: String, defaultValue: './cedict.sqlite' }
]
const options = commandLineArgs(optionDefinitions)
options.verbose && console.log('Got command line options: ', options)

// Check the input file exists
if (!fs.existsSync(options.input)) {
  console.error('Input file not found: ' + options.input)
  process.exit(1)
}

const db = require('./models')({
  dbpath: options.output
})
// Once we have access to the database we can start the fun stuff
console.log('Sync database')
let i = 0
db.sequelize.sync({ force: true }).then(async () => {
  console.log(' * Done!')
  const hskWords = await hsk.getWordList('%')
  console.log(`Found ${hskWords.length} HSK Words`)

  console.log('Starting to read file')
  const liner = new LineByLine(options.input)

  // Loop through every line in the file
  const seen = []
  let _line = liner.next()
  while (_line) {
    // Gotta be utf8
    let line = _line.toString('utf8')
    _line = liner.next()

    // Skip comments
    if (line.charAt(0) === '#') { continue }

    // Parse the line into an object
    let data = _.zipObject(keys, line.match(regex).slice(1, 5))

    // Ensure the pinyin is always lowercase..
    data.pinyin = data.pinyin.toLowerCase()

    // Generate the ID
    data.id = crypto.createHash('sha1')
      .update(data.simplified + data.traditional + data.pinyin)
      .digest('hex')

    // Merge with the other variant(s), ie Jia1/jia1/家/家
    if (seen.indexOf(data.id) !== -1) {
      let existing = await db.word.findOne({ where: { id: data.id } })
      existing.english = [].concat(data.english.split('/'), existing.english).join('/')
      await existing.save()
      options.verbose && console.log(`\nUpdated ${data.simplified} [${data.traditional}] /${data.pinyin}/`)
      continue
    }
    seen.push(data.id)
    // Add extra data for single characters
    if (data.simplified.length === 1) {
      // Add radical for single characters (keeping in mind that it may be different for simplified/traditional characters)
      data.radicalSimplified = bushou.for(data.simplified)
      data.radicalTraditional = bushou.for(data.traditional)

      // Add some unihan data if we can find it
      let unihanData = {
        Simplified: await getUnihan(data.simplified),
        Traditional: await getUnihan(data.traditional)
      }
      let variants = []
      ;(['Simplified', 'Traditional']).forEach((variant) => {
        if (unihanData[variant]) {
          // Add strokes
          if (unihanData[variant].kTotalStrokes) {
            data[`strokes${variant}`] = parseInt(unihanData[variant].kTotalStrokes) || null
          }
          if (unihanData[variant].kCangjie) {
            data[`cangjie${variant}`] = unihanData[variant].kCangjie || null
          }
          // Add radical
          // kRSKangXi - 40.7
          //   the 40 is the radical, ie 宀
          //   the 7 is additional strokes
          if (unihanData[variant].kRSKangXi) {
            let parts = unihanData[variant].kRSKangXi.split('.')
            let r = bushou.byIndex(parts[0])
            if (r) {
              let radical = r[ variant === 'Simplified' && r.simplified ? 'simplified' : 'radical' ]
              data[`radical${variant}`] = radical
              data[`strokesMinRad${variant}`] = parts[1]
            }
          }
          const parseVariants = (input) => {
            // eg. 'U+6C35<kMatthews,U+6C35<kMatthews'.match(/(U[+][0-9A-F]+)/g)
            if (!input) { return [] }
            return input.match(/(U[+][0-9A-F]+)/g).map((match) => {
              return String.fromCharCode(parseInt(match.replace('U+', ''), 16))
            })
          }
          variants = variants.concat(parseVariants(unihanData[variant].kZVariant))
          variants = variants.concat(parseVariants(unihanData[variant].kSemanticVariant))
          variants = variants.concat(parseVariants(unihanData[variant].kCompatibilityVariant))
          variants = variants.concat(parseVariants(unihanData[variant].kSpecializedSemanticVariant))
          if (unihanData[variant][`k${variant}Variant`]) {
            let ownVariants = parseVariants(unihanData[variant][`k${variant}Variant`])
            variants = variants.concat(ownVariants)
            data[`variants${variant}`] = ownVariants.join(',')
          }
        }
      })
      if (variants.length) {
        data.variants = _.uniq(variants.filter(v => !!v)).join(',')
      }
    }
    // Find hsk level?
    let inHsk = hskWords.filter(w => w.simplified === data.simplified && w.pronunciation === data.pinyin).shift()
    if (inHsk) {
      data.hskLevel = inHsk.level
    }

    // TODO: Add word 'frequency' so search results can be ordered more usefully
    try {
      await db.word.create(data)
    } catch (err) {
      console.error('Failed to insert record: ', JSON.stringify(data))
      console.error(err, err.stack)
      process.exit(1)
    }
    ++i
    process.stdout.write(`\rInserted: ${i}`)
  }
})
{
  "name": "@zhdict/cedict-js",
  "version": "0.0.1",
  "private": true,
  "description": "",
  "keywords": [
    "cedict",
    "cjk",
    "中文"
  ],
  "main": "lib/index.js",
  "directories": {
    "lib": "lib",
    "test": "test"
  },
  "scripts": {
    "test": "npm run lint && npm run unit-test",
    "unit-test": "jest --verbose --coverage",
    "lint": "standard 'lib/**/*.js'",
    "doc": "rm -rf docs && jsdoc -R README.md -d docs -r ./lib",
    "build": "rm -rf .tmp && mkdir .tmp && cd .tmp && wget 'https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.zip' && unzip cedict_1_0_ts_utf-8_mdbg.zip && node ../lib/parser.js --input cedict_ts.u8 --output /dev/shm/cedict.sqlite && mkdir -p ../data && cp /dev/shm/cedict.sqlite ../data/cedict.sqlite && cd .. && rm -rf .tmp && rm /dev/shm/cedict.sqlite"
  },
  "author": "Jarrod Linahan <jarrod@linahan.id.au>",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "https://git.carrotlabs.net/zhdict/cedict-js"
  },
  "dependencies": {
    "lodash": "^4.17.11",
    "sequelize": "^6.0.0",
    "sqlite3": "^4.0.4"
  },
  "devDependencies": {
    "@nahanil/bushou": "0.0.3",
    "@nahanil/hsk-words": "^0.2.0",
    "cjk-unihan": "0.0.3",
    "command-line-args": "^5.0.2",
    "jest": "^24.8.0",
    "jsdoc": "^3.6.2",
    "n-readlines": "^1.0.0",
    "standard": "^12.0.1"
  }
}
# cc-cedict-parser
### Installation
- Clone this repository
- Install dependencies with `npm install` from the project directory
### Usage
- Download the latest CC-CEDICT release from [https://www.mdbg.net/chinese/dictionary?page=cedict](https://www.mdbg.net/chinese/dictionary?page=cedict)
- Unzip it
- Run this:
```bash
node ./lib/parser.js --input cedict_1_0_ts_utf-8_mdbg.txt --output cedict.sqlite
```
#### OR
```bash
npm run build
```
### Yeah, it's sluggish
So, to avoid waiting around for this all day, point `--output` at some kind of RAM disk. That way the SQLite writes all happen in RAM, which should be quicker than hitting your disk. Just make sure you move the completed database into a 'real' directory after it's been created, otherwise it'll disappear at reboot.
**FYI**
- Using MacBook SSD: 5m47.255s
- Using MacBook RAMDisk: 2m56.968s
#### Linux
If you're running some variant of Linux you probably have access to the [`/dev/shm`](https://www.cyberciti.biz/tips/what-is-devshm-and-its-practical-usage.html) directory. Use it :)
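For example (assuming you've already downloaded and unzipped the CC-CEDICT file into the project root), something along these lines - it mirrors what the `npm run build` script does:
```bash
node ./lib/parser.js --input cedict_ts.u8 --output /dev/shm/cedict.sqlite
mkdir -p ./data && cp /dev/shm/cedict.sqlite ./data/cedict.sqlite
rm /dev/shm/cedict.sqlite
```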
#### OSX
If you're running OSX, there's a nifty guide to creating/mounting a RAMDisk [here, on StackOverflow](https://stackoverflow.com/a/2033417/742129).
TL;DR - Creating a 100MB RAMDisk
```bash
$ hdiutil attach -nomount ram://$((2 * 1024 * 100))
/dev/disk3
$ diskutil eraseVolume HFS+ RAMDisk /dev/disk3
Started erase on disk3
Unmounting disk
Erasing
Initialized /dev/rdisk3 as a 100 MB case-insensitive HFS Plus volume
Mounting disk
Finished erase on disk3 RAMDisk
```
Be sure that the last parameter to the second command is the output of the first, otherwise you risk messing up your other partitions :)
Open Finder and you should see the new RAMDisk in the list of 'Locations'; from the command line you'll find it under `/Volumes/RAMDisk`.
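Then point `--output` at the mounted volume, just like the `/dev/shm` example above:
```bash
node ./lib/parser.js --input cedict_ts.u8 --output /Volumes/RAMDisk/cedict.sqlite
```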
#### Windows
¯\\\_(ツ)\_/¯
You're on your own
### Future
#### TODO (Maybe)
- It might be nice to store word frequency so that results can be ordered in a potentially more useful manner - see [SUBTLEX-CH](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0010729) and [Jun Da's frequency statistics](http://lingua.mtsu.edu/chinese-computing/statistics/bigram/form.php). A rough sketch of what that ordering could look like follows this list.
- The [Unihan Database](http://www.unicode.org/charts/unihan.html) contains some interesting character data as well (stroke counts, radical data, character variations, plus some characters and character parts that aren't in CC-CEDICT) that could be brought in.
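As a rough sketch of the frequency idea (entirely hypothetical - the `frequency` column doesn't exist in the current model and would need to be populated by the parser from one of the sources above), ordering could look something like:
```js
// Hypothetical sketch: order matches by a (not yet existing) frequency column.
const db = require('./lib/models')()

const searchBySimplified = (simplified) => {
  return db.word.findAll({
    where: { simplified },
    order: [['frequency', 'DESC']] // most frequent words first
  })
}

searchBySimplified('水').then((results) => {
  console.log(results.map((w) => `${w.simplified} [${w.pinyin}] ${w.english}`))
})
```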
#### Improvements
There are a few optimisations that could be made here, though I'm not sure they'd be worth the effort.
- Removing `n-readlines` as a dependency and instead loading the entire file into memory shaved around a minute off the processing time in my (rather unscientific) tests.
- Batching the SQLite inserts _may_ help to improve throughput (a rough sketch follows the example below), though if I add back-tracking to merge entries like the 國 example below, batching would complicate things.
- Merge "duplicate" entries like the following:
```
國 国 [Guo2] /surname Guo/
國 国 [guo2] /country/nation/state/national/CL:個|个[ge4]/
```
\ No newline at end of file
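For the batching idea above, a minimal sketch (assuming parsed entries are simply buffered and flushed with Sequelize's `bulkCreate`, and ignoring the duplicate-merging complication) might look like:
```js
// Minimal sketch of batched inserts for the parser's write loop.
// Assumes `db` is the same models instance the parser already creates;
// duplicate-entry merging would still need to happen before buffering.
const db = require('./lib/models')({ dbpath: './cedict.sqlite' })

const BATCH_SIZE = 500
let buffer = []

const queueInsert = async (data) => {
  buffer.push(data)
  if (buffer.length >= BATCH_SIZE) {
    await db.word.bulkCreate(buffer)
    buffer = []
  }
}

// After the read loop finishes, flush whatever is left over.
const flush = async () => {
  if (buffer.length) {
    await db.word.bulkCreate(buffer)
    buffer = []
  }
}
```
The win, if any, would come from issuing one multi-row `INSERT` per flush instead of one statement per dictionary line.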
@@ -5,7 +5,7 @@ describe('search', () => {
     let result = await cedict.search('水')
     expect(result.length).toBe(1)
     expect(result[0]).toMatchObject({
-      id: 'a1e7fdf92f84aaca2675121d4b2b54eb5122bd97',
+      id: '3f69904700c4fe2fb2caeb3f46501bc66ce245a4',
       simplified: '水',
       traditional: '水',
       pinyin: 'shui3',
@@ -26,7 +26,7 @@ describe('search', () => {
     let result = await cedict.search({ simplified: '水' })
     expect(result.length).toBe(1)
     expect(result[0]).toMatchObject({
-      id: 'a1e7fdf92f84aaca2675121d4b2b54eb5122bd97',
+      id: '3f69904700c4fe2fb2caeb3f46501bc66ce245a4',
       simplified: '水',
       traditional: '水',
       pinyin: 'shui3',
......
@@ -5,7 +5,7 @@ describe('search', () => {
     cedict.search('水').then((result) => {
       expect(result.length).toBe(1)
       expect(result[0]).toMatchObject({
-        id: 'a1e7fdf92f84aaca2675121d4b2b54eb5122bd97',
+        id: '3f69904700c4fe2fb2caeb3f46501bc66ce245a4',
         simplified: '水',
         traditional: '水',
         pinyin: 'shui3',
@@ -29,7 +29,7 @@ describe('search', () => {
       .then((result) => {
         expect(result.length).toBe(1)
         expect(result[0]).toMatchObject({
-          id: 'a1e7fdf92f84aaca2675121d4b2b54eb5122bd97',
+          id: '3f69904700c4fe2fb2caeb3f46501bc66ce245a4',
           simplified: '水',
           traditional: '水',
           pinyin: 'shui3',
......