Commit 665ba4de authored by Jan Potoms's avatar Jan Potoms

Initial commit

parents
node_modules
\ No newline at end of file
node_modules
\ No newline at end of file
{
"indent" : 2,
"maxlen" : 80,
"maxerr" : 50,
"node" : true,
"devel" : true,
"curly" : true,
"eqeqeq" : true,
"immed" : true,
"latedef" : true,
"newcap" : true,
"noarg" : true,
"noempty" : true,
"nonew" : true,
"unused" : true,
"trailing" : true,
"strict" : true,
"quotmark" : true,
"sub" : true,
"undef" : true,
"funcscope" : true,
"validthis" : true,
"white" : true,
"predef" : [
"-Promise"
]
}
\ No newline at end of file
# robots-txt-parse
Streaming robots.txt parser
## usage
```js
var parse = require('robots-txt-parse'),
fs = require('fs');
parse(fs.createReadStream(__dirname + '/robots.txt'))
.then(function (robots) {
console.log(robots)
});
```
Assuming a `robots.txt` file with this content:
```
user-agent: *
user-agent: googlebot
disallow: /
user-agent: twitterbot
disallow: /
allow: /twitter
Sitemap: http://www.example.com/sitemap.xml
```
produces the following output:
```json
{
"groups": [{
"agents": [ "*", "googlebot" ],
"rules": [
{ "rule": "disallow", "path": "/" }
]
}, {
"agents": [ "twitterbot" ],
"rules": [
{ "rule": "disallow", "path": "/" },
{ "rule": "allow", "path": "/twitter" }
]
}],
"extensions": [
{ "extension": "sitemap", "value": "http://www.example.com/sitemap.xml" }
]
}
```
\ No newline at end of file
'use strict';
var split = require('split'),
through = require('through'),
combine = require('stream-combiner'),
Promise = require('bluebird');
// Token types produced by parseLine.
var START_GROUP = 'START_GROUP',
  GROUP_MEMBER = 'GROUP_MEMBER',
  NON_GROUP = 'NON_GROUP';

/**
 * Parses one robots.txt line into a token.
 *
 * Everything from the first `#` onwards is discarded as a comment. The
 * remainder is split on the FIRST colon only, so values that themselves
 * contain colons (absolute URLs such as `Sitemap: http://host/sitemap.xml`)
 * are kept intact instead of causing the whole line to be dropped.
 *
 * @param {string} line - a single line, without its line terminator
 * @returns {?Object} `null` when the line has no `field: value` separator;
 *   otherwise one of
 *   - `{ type: START_GROUP, agent }` for `user-agent`
 *   - `{ type: GROUP_MEMBER, rule, path }` for `allow` / `disallow`
 *   - `{ type: NON_GROUP, field, value }` for any other field
 *   The field name is lower-cased; field and value are trimmed.
 */
function parseLine(line) {
  var commentFree = line.replace(/#.*$/, ''),
    separator = commentFree.indexOf(':');

  // blank lines, comment-only lines and lines without a colon carry no record
  if (separator === -1) {
    return null;
  }

  var field = commentFree.slice(0, separator).trim().toLowerCase(),
    value = commentFree.slice(separator + 1).trim();

  switch (field) {
  case 'user-agent':
    return {
      type : START_GROUP,
      agent: value
    };
  case 'allow':
  case 'disallow':
    return {
      type: GROUP_MEMBER,
      rule: field,
      path: value
    };
  default:
    return {
      type : NON_GROUP,
      field: field,
      value: value
    };
  }
}
/**
 * Creates a `through` stream that turns incoming lines into tokens.
 *
 * Each chunk is handed to `parseLine`; a truthy result is queued downstream
 * and a falsy one is dropped.
 *
 * @returns {Stream} the stream returned by `through`
 */
function tokenize() {
  function onLine(line) {
    var parsed = parseLine(line);
    if (!parsed) {
      return;
    }
    this.queue(parsed);
  }

  return through(onLine);
}
module.exports = function parse(content) {
var result = {
groups: [],
extensions: []
};
var prevToken = null,
currentGroup = null;
var build = through(function (token) {
switch (token.type) {
case START_GROUP:
if (prevToken !== START_GROUP) {
currentGroup = {
agents : [],
rules : []
};
result.groups.push(currentGroup);
}
currentGroup.agents.push(token.agent);
break;
case GROUP_MEMBER:
if (currentGroup) {
currentGroup.rules.push({
rule: token.rule,
path: token.path
});
}
break;
case NON_GROUP:
result.extensions.push({
extension: token.field,
value: token.value
});
break;
}
prevToken = token.type;
});
return new Promise(function (resolve, reject) {
combine(
content,
split(),
tokenize(),
build
)
.on('error', reject)
.on('end', function () {
resolve(result);
});
});
};
{
"name": "robots-txt-parse",
"version": "0.0.1",
"description": "Streaming parser for robots.txt files",
"main": "lib/parse.js",
"scripts": {
"test": "mocha -R spec ./test",
"test-watch": "mocha -w -R spec ./test"
},
"author": "Jan Potoms",
"license": "MIT",
"devDependencies": {
"chai": "^1.9.1",
"mocha": "^1.18.2"
},
"dependencies": {
"bluebird": "^2.3.5",
"split": "^0.3.0",
"stream-combiner": "^0.2.1",
"through": "^2.3.4"
}
}
allow: '/error'
user-agent: *
disallow: '/'
\ No newline at end of file
user-agent: *
user-agent: agent1
user-agent: agent2
disallow: /
\ No newline at end of file
user-agent: *
user-agent: agent1
disallow: /path1
allow: /path2
user-agent: agent2
allow: /
user-agent: agent3
disallow: /path3
user-agent: *
disallow: /
\ No newline at end of file
user-agent: *
disallow: /
sitemap: /sitemap.xml
\ No newline at end of file
/*global describe, it*/
'use strict';
var parse = require('../lib/parse'),
fs = require('fs'),
path = require('path'),
assert = require('chai').assert;
/**
 * Opens a read stream on `fixtures/<name>.txt`, resolved against this
 * file's directory.
 *
 * @param {string} name - fixture base name, without the `.txt` extension
 * @returns {ReadStream} the stream returned by `fs.createReadStream`
 */
function getFixture(name) {
  var fileName = name + '.txt';
  return fs.createReadStream(path.resolve(__dirname, 'fixtures', fileName));
}
// Parser behaviour, exercised against the text fixtures in ./fixtures.
// Every test forwards assertion failures to mocha through `.catch(done)`.
describe('parser', function () {

  // one user-agent line followed by one rule
  it('should parse a simple group', function (done) {
    parse(getFixture('single-group'))
      .then(function (parsed) {
        assert.isObject(parsed);
        assert.property(parsed, 'groups');
        assert.isArray(parsed.groups);
        assert.lengthOf(parsed.groups, 1);
        var group = parsed.groups[0];
        assert.isObject(group);
        assert.property(group, 'agents');
        assert.isArray(group.agents);
        assert.lengthOf(group.agents, 1);
        assert.strictEqual(group.agents[0], '*');
        assert.property(group, 'rules');
        assert.isArray(group.rules);
        assert.lengthOf(group.rules, 1);
        var rule = group.rules[0];
        assert.isObject(rule);
        assert.propertyVal(rule, 'rule', 'disallow');
        assert.propertyVal(rule, 'path', '/');
        done();
      })
      .catch(done);
  });

  // consecutive user-agent lines belong to the same group
  it('should parse multiple agents', function (done) {
    parse(getFixture('multiple-agents'))
      .then(function (parsed) {
        assert.deepPropertyVal(parsed, 'groups[0].agents[0]', '*');
        assert.deepPropertyVal(parsed, 'groups[0].agents[1]', 'agent1');
        assert.deepPropertyVal(parsed, 'groups[0].agents[2]', 'agent2');
        done();
      })
      .catch(done);
  });

  // the fixture starts with an `allow` line before any user-agent line
  it('should ignore group members outside of a group', function (done) {
    parse(getFixture('member-outside'))
      .then(function (parsed) {
        assert.lengthOf(parsed.groups, 1);
        assert.deepPropertyVal(parsed, 'groups[0].agents[0]', '*');
        assert.lengthOf(parsed.groups[0].agents, 1);
        // the stray leading `allow` must not leak into the group that
        // follows: only the `disallow` after the user-agent line remains
        assert.lengthOf(parsed.groups[0].rules, 1);
        assert.deepPropertyVal(parsed, 'groups[0].rules[0].rule', 'disallow');
        done();
      })
      .catch(done);
  });

  // fields other than user-agent/allow/disallow end up in `extensions`
  it('should parse extensions', function (done) {
    parse(getFixture('with-sitemap'))
      .then(function (parsed) {
        assert.deepPropertyVal(parsed, 'extensions[0].extension', 'sitemap');
        assert.deepPropertyVal(parsed, 'extensions[0].value', '/sitemap.xml');
        done();
      })
      .catch(done);
  });

  // a user-agent line that follows a rule starts a new group
  it('should parse multiple groups', function (done) {
    parse(getFixture('multiple-groups'))
      .then(function (parsed) {
        assert.deepPropertyVal(parsed, 'groups[0].agents[0]', '*');
        assert.deepPropertyVal(parsed, 'groups[0].agents[1]', 'agent1');
        assert.deepPropertyVal(parsed, 'groups[0].rules[0].rule', 'disallow');
        assert.deepPropertyVal(parsed, 'groups[0].rules[0].path', '/path1');
        assert.deepPropertyVal(parsed, 'groups[0].rules[1].rule', 'allow');
        assert.deepPropertyVal(parsed, 'groups[0].rules[1].path', '/path2');
        assert.deepPropertyVal(parsed, 'groups[1].agents[0]', 'agent2');
        assert.deepPropertyVal(parsed, 'groups[1].rules[0].rule', 'allow');
        assert.deepPropertyVal(parsed, 'groups[1].rules[0].path', '/');
        assert.deepPropertyVal(parsed, 'groups[2].agents[0]', 'agent3');
        assert.deepPropertyVal(parsed, 'groups[2].rules[0].rule', 'disallow');
        assert.deepPropertyVal(parsed, 'groups[2].rules[0].path', '/path3');
        done();
      })
      .catch(done);
  });
});
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment