|
|
@ -13,6 +13,7 @@ const extensions = /^([^.]+$|\.(asp|aspx|cgi|htm|html|jsp|php)$)/;
|
|
|
|
class Driver {
|
|
|
|
class Driver {
|
|
|
|
constructor(pageUrl, options) {
|
|
|
|
constructor(pageUrl, options) {
|
|
|
|
this.options = Object.assign({}, {
|
|
|
|
this.options = Object.assign({}, {
|
|
|
|
|
|
|
|
chunkSize: 5,
|
|
|
|
debug: false,
|
|
|
|
debug: false,
|
|
|
|
delay: 500,
|
|
|
|
delay: 500,
|
|
|
|
maxDepth: 3,
|
|
|
|
maxDepth: 3,
|
|
|
@ -133,6 +134,7 @@ class Driver {
|
|
|
|
});
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
const links = Array.from(browser.document.getElementsByTagName('a'))
|
|
|
|
const links = Array.from(browser.document.getElementsByTagName('a'))
|
|
|
|
|
|
|
|
.filter(link => link.protocol === 'http:' || link.protocol === 'https:')
|
|
|
|
.filter(link => link.hostname === this.origPageUrl.hostname)
|
|
|
|
.filter(link => link.hostname === this.origPageUrl.hostname)
|
|
|
|
.filter(link => extensions.test(link.pathname))
|
|
|
|
.filter(link => extensions.test(link.pathname))
|
|
|
|
.map(link => { link.hash = ''; return url.parse(link.href) });
|
|
|
|
.map(link => { link.hash = ''; return url.parse(link.href) });
|
|
|
@ -256,7 +258,7 @@ class Driver {
|
|
|
|
return js;
|
|
|
|
return js;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
crawl(pageUrl, index = 1, depth = 1) {
|
|
|
|
crawl(pageUrl, index, depth = 1) {
|
|
|
|
pageUrl.canonical = pageUrl.protocol + '//' + pageUrl.host + pageUrl.pathname;
|
|
|
|
pageUrl.canonical = pageUrl.protocol + '//' + pageUrl.host + pageUrl.pathname;
|
|
|
|
|
|
|
|
|
|
|
|
return new Promise(resolve => {
|
|
|
|
return new Promise(resolve => {
|
|
|
@ -264,7 +266,7 @@ class Driver {
|
|
|
|
.catch(() => {})
|
|
|
|
.catch(() => {})
|
|
|
|
.then(links => {
|
|
|
|
.then(links => {
|
|
|
|
if ( links && Boolean(this.options.recursive) && depth < this.options.maxDepth ) {
|
|
|
|
if ( links && Boolean(this.options.recursive) && depth < this.options.maxDepth ) {
|
|
|
|
return Promise.all(links.map((link, index) => this.crawl(link, index + 1, depth + 1)));
|
|
|
|
return this.chunk(links.slice(0, this.options.maxUrls), depth + 1);
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
return Promise.resolve();
|
|
|
|
return Promise.resolve();
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -279,6 +281,20 @@ class Driver {
|
|
|
|
});
|
|
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
chunk(links, depth, chunk = 0) {
|
|
|
|
|
|
|
|
if ( links.length === 0 ) {
|
|
|
|
|
|
|
|
return Promise.resolve();
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const chunked = links.splice(0, this.options.chunkSize);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return new Promise(resolve => {
|
|
|
|
|
|
|
|
Promise.all(chunked.map((link, index) => this.crawl(link, index, depth)))
|
|
|
|
|
|
|
|
.then(() => this.chunk(links, depth, chunk + 1))
|
|
|
|
|
|
|
|
.then(() => resolve());
|
|
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
sleep(ms) {
|
|
|
|
sleep(ms) {
|
|
|
|
return ms ? new Promise(resolve => setTimeout(resolve, ms)) : Promise.resolve();
|
|
|
|
return ms ? new Promise(resolve => setTimeout(resolve, ms)) : Promise.resolve();
|
|
|
|
}
|
|
|
|
}
|
|
|
|