Commit 6211efc0 authored by Jarrod's avatar Jarrod 💬

Many zhdict things

parent 0ef4b430
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
const mcache = require('memory-cache');
module.exports = {
friendlyName: 'Get admin dashboard',
......@@ -8,7 +10,12 @@ module.exports = {
users: await User.count(),
words: await Word.count(),
wordLists: await WordList.count(),
searches24hr: await DictionarySearchLog.count({ createdAt: { '>=': Date.now() } })
searches24hr: await DictionarySearchLog.count({
createdAt: { '>=': new Date(Date.now() - (60 * 60 * 24 * 1000)) }
}),
cache: {
size: mcache.size()
}
}
if (typeof Comment !== 'undefined') {
......
module.exports = {
friendlyName: 'Get the daily idiom',
description: '',
inputs: {
},
fn: async function(inputs) {
// offset = (approx) number of days since 18 Dec 2018 UTC time
let offset = Math.floor(Date.now() / 8.64e7) - 17882;
let sql = `SELECT id, simplified, traditional, pinyin, definitions, "hskLevel"
FROM word
WHERE definitions &@ 'idiom'
AND length(traditional) = 4
ORDER BY id ASC
OFFSET ${offset}
LIMIT 1;`;
let result = await sails.sendNativeQuery(sql)
return result && result.rows && result.rows.length ? result.rows[0] : null
}
}
\ No newline at end of file
const util = require('util');
const decomp = require('@zhdict/zh-decomp');
const getUnihan = util.promisify(require('cjk-unihan').get);
module.exports = {
friendlyName: 'Get a dictionary entry',
description: '',
inputs: {
word: {
type: 'string',
required: true
},
full: {
type: 'ref',
defaultsTo: false
}
},
fn: async function(inputs) {
const { res } = this
const query = inputs.word
let response = {}
response.words = await Word.find({
where: {
or: [
{ simplified: query },
{ traditional: query },
]
}
})//.select(...)
.sort(['hskLevel ASC', 'frequency DESC', 'pinyin ASC'])
if (!response.words || !response.words.length) {
if (query.length === 1) {
return (await tryUnihanLookup(res, query))
}
return res.notFound()
}
let variants = [];
let allChars = [];
let charData = {};
let strokeData = {};
// response.words = words.map(async (w) => {
for (let i=0; i < response.words.length; i++) {
let w = response.words[i];
if (inputs.full) {
allChars.push(...w.traditional.split(''));
allChars.push(...w.simplified.split(''));
if (w.simplified.length === 1) {
if (w.variants) {
variants.push(...w.variants);
}
['simplified', 'traditional'].forEach((variant) => {
if (!charData[w[variant]]) {
let ucKey = variant === 'traditional' ? 'Traditional' : 'Simplified';
charData[w[variant]] = {
radical: w[`radical${ucKey}`],
strokes: w[`strokes${ucKey}`],
strokesMinRad: w[`strokesMinRad${ucKey}`],
decomp: decomp.lookupNg(w[variant])
};
}
});
}
}
response.words[i] = {
id: w.id,
traditional: w.traditional,
simplified: w.simplified,
definitions: w.definitions,
hskLevel: w.hskLevel || null,
pinyin: w.pinyin,
audio: await sails.helpers.zhdict.audio.getAudioUriForWord(w)
};
}
// Return a 'short version' of this data for the popup things
if (!inputs.full) {
return response
}
if (response.words[0].length !== 1) {
// Try tokenize/find components that makeup compound words
let parts = app.tokenizer_t.tokenize(response.words[0].traditional);
if (parts.length !== 1) {
response.parts = parts.map((r) => {
r.definitions = r.english ? r.english.split('/') : [];
delete r.english;
return r;
});
}
}
response.charData = charData;
if (variants && variants.length) {
response.variants = _.uniq(variants);
}
// Load stroke animation data
_.uniq(allChars).forEach((char) => {
try {
strokeData[char] = require('hanzi-writer-data/' + char);
} catch(e) {}
});
response.strokeData = strokeData;
// Try to sort results a bit lesss shitty-ly
if (response.words.length > 1) {
response.words = response.words.sort((a, b) => {
// return a - b
let x = a.definitions.filter(x => x.indexOf('variant') !== -1).length ? 1 : 0;
let y = b.definitions.filter(x => x.indexOf('variant') !== -1).length ? 1 : 0;
return x - y;
});
}
return response;
}
}
/**
 * Fall back to the Unihan database for single-character lookups that are
 * missing from the main dictionary. Responds via `res` on failure; otherwise
 * returns a response object shaped like the main entry lookup.
 */
async function tryUnihanLookup(res, query) {
  if (query.length !== 1) { return res.notFound('No entry found'); }
  let char;
  try {
    char = await getUnihan(query);
  } catch (err) {
    sails.log.error('Unable to load unihan character data', err.stack)
    return res.serverError('Unable to load results')
  }
  if (!char) {
    return res.notFound('No entry found');
  }
  let response = {
    words: [{
      simplified: char.character,
      traditional: char.character,
      // kHanyuPinyin values carry a dictionary-position prefix
      // ("10038.090:pinyin") — strip everything up to the colon.
      pinyin: (char.kHanyuPinyin || '').replace(/^[0-9.]+:/, ''),
      definitions: char.kDefinition ? char.kDefinition.split(/;\s*/) : [],
    }]
  };
  // Attach stroke-order data when available (best-effort).
  try {
    // BUGFIX: `strokes` was previously assigned without declaration,
    // creating an implicit global.
    let strokes = require('hanzi-writer-data/' + char.character);
    if (strokes) {
      response.strokeData = {};
      response.strokeData[`${char.character}`] = strokes;
    }
  } catch(e) {}
  let variants = [];
  // Collect variant characters from Unihan fields that look like
  // 'U+6C35<kMatthews,U+6C35<kMatthews' by extracting the U+XXXX codepoints.
  function parseVariants(input) {
    if (!input) { return; }
    // BUGFIX: guard against zero matches (match() returns null), and use
    // fromCodePoint so characters outside the BMP decode correctly
    // (fromCharCode only handles single UTF-16 code units).
    (input.match(/(U[+][0-9A-F]+)/g) || []).forEach((match) => {
      variants.push(String.fromCodePoint(parseInt(match.replace('U+', ''), 16)));
    });
  }
  [
    'kSemanticVariant',
    'kSimplifiedVariant',
    'kTraditionalVariant',
    'kZVariant',
    'kCompatibilityVariant',
    'kSpecializedSemanticVariant'
  ].forEach((key) => {
    if (char[key]) {
      parseVariants(char[key]);
    }
  });
  if (variants.length) {
    response.variants = _.uniq(variants);
  }
  return response
}
\ No newline at end of file
module.exports = {
friendlyName: 'Get a HSK vocabulary list',
description: '',
inputs: {
level: {
type: 'number',
required: true,
isInteger: true,
min: 1,
max: 6
}
},
fn: async function(inputs) {
const words = await Word.find({ hskLevel: inputs.level })
.select(['simplified', 'traditional', 'pinyin', 'definitions'])
.sort(['frequency DESC', 'pinyin ASC'])
return { words }
}
}
\ No newline at end of file
const _ = require('lodash');
module.exports = {
friendlyName: 'Get a Chinese radical and characters containing it',
description: '',
todo: 'Use Word model rather than Sequelize thingy',
inputs: {
radical: {
type: 'string',
required: true
}
},
fn: async function(inputs) {
const radicals = require('../../data/radicals');
let radical = inputs.radical;
let info = radicals.filter((r) => { return r.radical === radical || r.simplified === radical || r.variants.match(radical) }).shift();
if (!info) {
return res.notFound('Radical not found')
}
const results = await Word.find({
where: {
or: [
{ radicalSimplified: radical },
{ radicalTraditional: radical }
]
}
}).select(['traditional', 'simplified', 'pinyin', 'definitions', 'radicalSimplified', 'radicalTraditional', 'strokesMinRadSimplified', 'strokesMinRadTraditional'])
let chars = {};
results.forEach((r) => {
if (r.radicalSimplified == radical) {
if (!chars[ r.strokesMinRadSimplified ]) { chars[ r.strokesMinRadSimplified ] = []; }
chars[ r.strokesMinRadSimplified ].push({
char: r.simplified,
pinyin: r.pinyin,
definitions: r.definitions
})
}
if (r.radicalTraditional == radical) {
if (!chars[ r.strokesMinRadTraditional ]) { chars[ r.strokesMinRadTraditional ] = []; }
chars[ r.strokesMinRadTraditional ].push({
char: r.traditional,
pinyin: r.pinyin,
definitions: r.definitions
})
}
});
// remove inevitable duplicates. #shitpile
Object.keys(chars).forEach((key) => {
chars[key] = _.uniqWith(chars[key], (a, b) => {
return a.char === b.char && (parseInt(a.pinyin.substr(-1)) || 5) === (parseInt(b.pinyin.substr(-1)) || 5);
});
});
return { radical: info, characters: chars };
}
}
\ No newline at end of file
module.exports = {
friendlyName: 'Get example sentences for a given word',
description: '',
inputs: {
word: {
type: 'string',
required: true
}
},
fn: async function(inputs) {
const { req, res } = this
const word = inputs.word.trim().replace(/[%_'"]/g, '')
if (!word) {
return res.badRequest('Invalid/missing query word')
}
const qyWord = `%${word}%`
const sentences = await ExampleSentence.find({
where: {
or: [
{ simplified: { like: qyWord } },
{ traditional: { like: qyWord } },
]
}
})
.select([ 'id', 'simplified', 'traditional', 'pinyin', 'translations', 'audio' ])
.limit(40) // 的 returns > 400 results!
// Munge and tokenify
sentences.forEach((s) => {
// s.audio = (req.secure ? 'https://' : 'http://') + req.headers.host + '/audio/' + s.audio;
s.tokens = { simplified: [], traditional: [] }
for (let t of sails.helpers.zhdict.tokenizer.tokenize(s.traditional)) {
s.tokens.simplified.push({ l: !!t.pinyin, w: t.simplified })
s.tokens.traditional.push({ l: !!t.pinyin, w: t.traditional })
}
});
return { sentences }
}
}
\ No newline at end of file
module.exports = {
friendlyName: 'Get a list of Chinese radicals',
description: '',
fn: async function(inputs) {
const radicals = require('../../data/radicals');
return {
radicals: radicals.map((r) => { r.traditional = r.radical; return r; })
}
}
}
\ No newline at end of file
const chineseConv = require('chinese-conv');
// const corenlp = require('../../lib/helpers/corenlp');
module.exports = {
friendlyName: 'Search the dictionary',
description: '',
inputs: {
q: {
type: 'string',
required: true,
minLength: 1,
maxLength: 100,
},
page: {
type: 'number',
isInteger: true,
defaultsTo: 1,
min: 1,
max: 99999,
}
},
fn: async function(inputs) {
const { req } = this
const { q, page } = inputs
const limit = 20
if (page < 1) { page = 1 }
let offset = page * limit - limit
let searchSql = `SELECT
traditional,
simplified,
pinyin,
definitions,
"hskLevel",
frequency,
pgroonga_score(tableoid, ctid) AS score,
CASE WHEN "hskLevel" = 6 THEN 1
WHEN "hskLevel" = 5 THEN 2
WHEN "hskLevel" = 4 THEN 3
WHEN "hskLevel" = 3 THEN 4
WHEN "hskLevel" = 2 THEN 5
WHEN "hskLevel" = 1 THEN 6
WHEN "hskLevel" is null THEN 0.00001
END,
((pgroonga_score(tableoid, ctid) + (w.frequency/5) + ("hskLevel" / 6)) /2) AS "orderScore"
FROM word w
WHERE traditional &@ $1
OR simplified &@ $1
OR definitions &@ $1
OR pronunciation &@ $1
ORDER BY "orderScore" DESC
NULLS LAST
OFFSET $3
LIMIT $2`
let countSql = `SELECT COUNT(1) AS count FROM word w
WHERE traditional &@ $1
OR simplified &@ $1
OR definitions &@ $1
OR pronunciation &@ $1`
let results
let totalResults
try {
results = (await sails.sendNativeQuery(searchSql, [ q, limit, offset ])).rows
totalResults = (await sails.sendNativeQuery(countSql, [ q ])).rows[0].count
} catch (err) {
console.error(`Unable to perform dictionary search for ${q}`, err.stack)
return res.status(500).json('Unable to perform search')
}
// Fall back to splitting a potentially long input to find chinesey words
if (!results || !results.length) {
return tryTokenizedSearch(this.req, q)
}
DictionarySearchLog.create({
query: q,
user: req.me ? req.me.id : null,
ip: req.ip,
totalResults: totalResults
}, (err, res) => { /* Do this 'in the background'... */ })
// console.log(results)
// results = results.sort((a,b) => {
// return b.sortScore - a.sortScore
// })
return {
total: totalResults,
pages: Math.ceil(totalResults / limit),
results: results
}
}
}
/**
 * Fallback search: normalize the query to traditional script, tokenize it,
 * and return only the tokens that exist in the dictionary (have pinyin and
 * an English gloss). Logs the attempt like a normal search would.
 */
function tryTokenizedSearch(req, query) {
  // Normalize query (trad/simp)
  const normalized = chineseConv.tify(query);
  const found = app.tokenizer_t.tokenize(normalized).filter((t) => t.pinyin && t.english);
  DictionarySearchLog.create({
    query: normalized,
    user: req.me ? req.me.id : null,
    ip: req.ip,
    totalResults: found.length
  }, (err, res) => { /* Do this 'in the background'... */ })
  if (!found || !found.length) {
    return { total: 0, pages: 0, results: [] }
  }
  // Reshape tokens into the same result shape as a direct search.
  const results = found.map((token) => {
    token.definitions = token.english.split('/');
    token.pinyin = token.pinyin.replace(/[/]/g, ' / ');
    delete token.english;
    return token;
  });
  return {
    total: results.length,
    pages: 1,
    results: results
  }
}
// TODO: This uses CoreNLP to tokenize the string, then the old node/cedict
// tokenizer to split up any tokenized-but-not-found-in-cedict-words into
// their smaller components. Not sure if CoreNLP will handle heavy traffic
// well, so for now just using the old method `tryTokenizedSearch()`
/*
async function tryTokenizedSearchNg(req, res, next) {
let query = chineseConv.sify(req.inputs.q);
let parsed = await corenlp.tokenize(query);
let tokens = [];
let ogTokens = [];
if (parsed && parsed.sentences) {
parsed.sentences.forEach((s) => {
if (s.tokens) {
s.tokens.forEach((t) => {
ogTokens.push(t)
tokens.push(
...app.tokenizer_t.tokenize(chineseConv.tify(t.word), true).map((s) => {
if (s.pinyin) s.pinyin.replace(/[/]/g, ' / ');
s.definitions = s.english ? s.english.split('/') : [];
delete s.english;
return s;
})
)
})
}
})
}
return res.status(419).json({
ogTokens,
tokens,
});
}
*/
/*
* corenlp helper
*/
/*
const tokenizer_t = require("@zhdict/tokenizer")(require('path').join(__dirname, '../../data/cedict_ts.u8'), 'traditional');
const request = require('request');
const querystring = require('querystring');
// wget -q --post-data "這藥會讓你好受點。"
// 'localhost:9999/?properties={"annotators":"openie","outputFormat":"json"}&pipelineLanguage=zh'
async function tokenize (input) {
console.log('corenlp')
let url = 'http://localhost:9999';
const params = {
properties: JSON.stringify({
annotators: 'openie',
outputFormat: 'json'
}),
pipelineLanguage: 'zh'
};
console.log('corenlp, return promise')
return new Promise((resolve, reject) => {
console.log('corenlp run promise: ', url + '?' + querystring.stringify(params))
request.post(url + '?' + querystring.stringify(params), {
body: input
}, function (err, res, body) {
console.log('corenlp, request done', err)
if (err) return reject(err);
console.log('corenlp, resolving')
try {
return resolve(JSON.parse(body));
} catch (e) { return reject(e); }
});
});
}
module.exports = {
tokenize
};
*/
\ No newline at end of file
const util = require('util');
const fs = require('fs');
const fileExists = util.promisify(fs.exists);
const Path = require('path')
const pronunciation = require('zh-pronunciation');
const Polly = require('polly-tts');
// These should be configured upstream but bleh
const audioFileBasePath = Path.join(sails.config.appPath, '/public/audio');
// AWS POLLY 'pollyuser'
// SECURITY: AWS credentials were previously hard-coded here. Anything that
// has ever been committed to source control must be treated as compromised —
// rotate the key pair and provide replacements via the environment.
let ACCESS_KEY_ID = process.env.AWS_ACCESS_KEY_ID;
let SECRET_ACCESS_KEY = process.env.AWS_SECRET_ACCESS_KEY;
let TTS_VOICE = 'Zhiyu'
/**
 * Synthesize `text` to an mp3 file at `output` using AWS Polly.
 * Resolves once the file has been fully written.
 * Rejects with the original error object (previously only `err.message` was
 * propagated, discarding the stack and error type).
 */
function tts(text, output) {
  let polly = new Polly({ accessKeyId: ACCESS_KEY_ID, secretAccessKey: SECRET_ACCESS_KEY });
  return new Promise((resolve, reject) => {
    polly.textToSpeech({ text: text, voiceId: TTS_VOICE }, (err, audioStream) => {
      if (err) { return reject(err); }
      let fileStream = fs.createWriteStream(output);
      audioStream.pipe(fileStream)
        .on('finish', function() {
          resolve();
        })
        // BUGFIX: a write-stream failure previously left the promise
        // pending forever.
        .on('error', reject);
    });
  });
};
/**
 * Resolve an audio URI (path relative to the public audio directory) for a
 * dictionary word, generating and caching a TTS version if no pre-recorded
 * file exists. Returns null when audio cannot be retrieved.
 *
 * Rewritten as a plain async function: the previous `new Promise(async ...)`
 * wrapper is an anti-pattern — rejections thrown inside the async executor
 * are silently swallowed rather than rejecting the returned promise.
 */
async function getAudioUriForWord(word) {
  // 1) Do we have the pinyin audio?
  let tryFile = 'word/pinyin/' + pronunciation.convert(word.pinyin).replace(/\s/g, '') + '.mp3';
  let exists = await fileExists(Path.join(audioFileBasePath, tryFile)).catch((e) => {});
  if (exists) {
    return tryFile;
  }
  // 2) Do we have the 中文 audio?
  tryFile = 'word/char/' + word.simplified.replace(/\s/g, '') + '.mp3';
  exists = await fileExists(Path.join(audioFileBasePath, tryFile)).catch((e) => {});
  if (exists) {
    return tryFile;
  }
  // 3) Try to get (and store) TTS version
  try {
    await tts(word.simplified, Path.join(audioFileBasePath, tryFile));
  } catch (err) {
    // Best-effort: callers treat null as "no audio available".
    sails.log.error(`Failed to retrieve TTS audio`, err);
    return null;
  }
  return tryFile;
};
// Public surface of the audio helper: `tts` for direct synthesis,
// `getAudioUriForWord` for find-or-create audio lookup on a word entry.
module.exports = {
  tts,
  getAudioUriForWord,
};
/**
* Forum
*/
const path = require('path')
const Pluggable = require('@nahanil/ahoy/lib/pluggable')
const tokenizer_t = require("@zhdict/tokenizer")(require('path').join(__dirname, '../data/cedict_ts.u8'), 'traditional');
const tokenizer = require("@zhdict/tokenizer")(
path.join(__dirname, '../data/cedict_ts.u8'),
'traditional'
);
module.exports = new Pluggable({
apiBase: '',
......@@ -11,14 +15,13 @@ module.exports = new Pluggable({
actions: {
// Public routes
/*
'GET /dictionary/:word': { action: 'get-entry', cache: 30 },
'GET /sentences/:word': { action: 'get-sentences', cache: 30 },
'GET /search': { action: 'search' },
'GET /daily-idiom': { action: 'daily-idiom', cache: 60 * 60 },
'GET /radicals': { action: 'radicals', cache: 86400 },
'GET /dictionary/:word': { action: 'get-entry', cache: 86400 },
'GET /sentences/:word': { action: 'get-sentences', cache: 86400 },
'GET /search': { action: 'search-dictionary', cache: 300 },
'GET /daily-idiom': { action: 'get-daily-idiom', cache: 60 * 60 },
'GET /radicals': { action: 'list-radicals', cache: 86400 },
'GET /radicals/:radical': { action: 'get-radical', cache: 86400 },
*/
'GET /hsk/list/:level': { action: 'get-hsk-list', cache: 86400 },
// Admin Routes
'GET /admin/dashboard': {
......@@ -28,6 +31,7 @@ module.exports = new Pluggable({
},