When to use it?
When you want to retrieve information about the page pointed to by the URL that your user has just copied, scrapinode is a great fit. First, scrapinode comes out of the box with a great feature: with 1 line of code, it will give you the title, the description and the images of any HTML page on the web. Second, if you need more, you can extend it. See the examples to learn more about it.
Features
- Retrieve content like "title", "descriptions", "images", "videos" on any HTML pages with 1 line of code.
- Define specific operations based on the url of the page and the content you hope to retrieve, using regex.
- Scrape pages with jsdom + jquery or with cheerio.
- The HTTP client takes care of handling HTTP and HTML redirections.
- Scrape an image as if it were an HTML page.
Install
npm install scrapinode
Usage
var scrapinode = require('scrapinode');

// Operation attached to the "title" content of society6.com pages
var society6Title = function(window){
  var $ = window.$;
  var pageUrl = window.location.href; // url of the page, available if you need to inspect it
  var heading = $('h1[itemprop="name"]').text();
  // Returning null lets the next matching route provide the title instead
  return heading ? heading : null;
};

// Called once the page has been loaded and the scraper is ready
var printTitle = function(err,scraper){
  if(err){
    return console.error(err);
  }
  console.log(scraper.get('title')); // e.g. "Sounds Good Dude"
};

// Define an operation for a specific route and content
scrapinode.use('society6.com','title',society6Title);
// Use default operations for content like "title", "descriptions", "images", "videos"
scrapinode.useAll(scrapinode.defaults());
scrapinode.createScraper('http://society6.com/product/Sounds-Good-Dude_T-shirt',printTitle);
Test
npm test
Test coverage
make coverage
Examples
Define a new operation for a route
A route is characterized by a content and a path (each is a regex or a string); an operation is also attached to each route.
var scrapinode = require('./../');

// Operation providing the "title" content: text of the product heading
var productTitle = function(window){
  var $ = window.$;
  return $('h1[itemprop="name"]').text();
};

// Operation providing the "artist" content: text of the artist link
var productArtist = function(window){
  var $ = window.$;
  return $('.details>h3>a').text();
};

// Shared callback: prints the title followed by the artist
var printTitleAndArtist = function(err,scraper){
  if(err){
    return console.error(err);
  }
  var title = scraper.get('title');
  var artist = scraper.get('artist');
  console.log(title,artist);
};

scrapinode.use(/http:\/\/society6\.com/,'title',productTitle);
scrapinode.use(/http:\/\/society6\.com/,'artist',productArtist);

// "Sounds Good Dude Chase Kunz"
scrapinode.createScraper('http://society6.com/product/Sounds-Good-Dude_T-shirt',printTitleAndArtist);
// "the lord of fashion H A P P Y J O Y"
scrapinode.createScraper('http://society6.com/product/the-lord-of-fashion_T-shirt',printTitleAndArtist);
Terminology
If you wonder what I mean by "path", "content", "operation" and "route" this is your answer.
var scrapinode = require('./..');

// "path": selects the pages a route applies to
var path = /http:\/\/society6.com/;
// "content": name of the piece of information the route provides
var content = 'title';
// "operation": function receiving the page "window" and returning the content
var operation = function(window){
  var $ = window.$;
  var heading = $('h1[itemprop="name"]');
  return heading.text();
};

// A "route" is the association of a "path" and a "content",
// registered together with its "operation"
scrapinode.use(path,content,operation);

scrapinode.createScraper('http://society6.com/product/Sounds-Good-Dude_T-shirt',function(err,scraper){
  if(err){
    console.error(err);
    return;
  }
  console.log(scraper.get('title')); // e.g. "Sounds Good Dude"
});
Use jsdom and jquery
var scrapinode = require('./../');

// "title" operation for society6 pages
var jsdomTitle = function(window){
  var $ = window.$;
  return $('h1[itemprop="name"]').text();
};
scrapinode.use(/http:\/\/society6.com/,'title',jsdomTitle);

// Passing an options object instead of a url lets you pick the engine explicitly
scrapinode.createScraper({
  url : 'http://society6.com/product/Sounds-Good-Dude_T-shirt',
  engine : 'jsdom'
},function(err,scraper){
  if(err){
    return console.error(err);
  }
  console.log(scraper.get('title')); // e.g. "Sounds Good Dude"
});
Get the title and the description
var scrapinode = require('./../');

// Register the default routes ("title", "descriptions", "images", "videos")
var defaultRoutes = scrapinode.defaults();
scrapinode.useAll(defaultRoutes);

scrapinode.createScraper('http://society6.com/product/Sounds-Good-Dude_T-shirt',function(err,scraper){
  if(err){
    return console.error(err);
  }
  var title = scraper.get('title');
  // "descriptions" is an array, keep the first one only
  var descriptions = scraper.get('descriptions');
  console.log(title,descriptions[0]);
  // e.g. "Sounds Good Dude American Apparel T-shirts are made with 100% fine jersey cotton combed for softness and comfort.
  // (Athletic Grey and Athletic Blue contain 50% polyester / 25% cotton / 25% rayon)"
});
Use jsdom and zepto
var scrapinode = require('./../');

// With the "jsdom+zepto" engine, window.$ is used exactly like in the jquery example
var zeptoTitle = function(window){
  var $ = window.$;
  var heading = $('h1[itemprop="name"]');
  return heading.text();
};
scrapinode.use(/http:\/\/society6.com/,'title',zeptoTitle);

var zeptoOptions = {
  url : 'http://society6.com/product/Sounds-Good-Dude_T-shirt',
  engine : 'jsdom+zepto'
};

scrapinode.createScraper(zeptoOptions,function(err,scraper){
  if(err){
    console.error(err);
    return;
  }
  var title = scraper.get('title');
  console.log(title); // e.g. "Sounds Good Dude"
});
Take advantage of the middleware system
The first route/operation used in the stack (router) should be specific and the others less and less specific, more generic.
var scrapinode = require('./../');

// Specific route first: its selector suits product pages, not the index page
var productPageTitle = function(window){
  var $ = window.$;
  var title = $('h1[itemprop="name"]').text();
  console.log('see the title here is empty: "' + title + '"');
  // A falsy title is turned into null: the next matching route is then tried
  // with the same "window", and so on until an operation returns something
  // other than null
  return title ? title : null;
};

// Generic fallback: matches every url and reads the <title> tag
var anyPageTitle = function(window){
  var $ = window.$;
  return $('title').text();
};

scrapinode.use(/http:\/\/society6\.com/,'title',productPageTitle);
scrapinode.use('*','title',anyPageTitle);

scrapinode.createScraper('http://society6.com',function(err,scraper){
  if(err){
    return console.error(err);
  }
  console.log(scraper.get('title')); // "Society6 | Affordable Art Prints, iPhone Cases and T-shirts"
});
Use cheerio
var scrapinode = require('./../');

// "title" operation for society6 pages, run against the cheerio-built window
var cheerioTitle = function(window){
  var $ = window.$;
  return $('h1[itemprop="name"]').text();
};
scrapinode.use(/http:\/\/society6.com/,'title',cheerioTitle);

var cheerioOptions = {
  url : 'http://society6.com/product/Sounds-Good-Dude_T-shirt',
  engine : 'cheerio'
};

var onCheerioScraper = function(err,scraper){
  if(err){
    return console.error(err);
  }
  console.log(scraper.get('title')); // e.g. "Sounds Good Dude"
};

scrapinode.createScraper(cheerioOptions,onCheerioScraper);
Simple usage
Scrape the page and get its title using the default operations.
var scrapinode = require('./../');

// The default routes are enough to retrieve the title
scrapinode.useAll(scrapinode.defaults());

var productUrl = 'http://society6.com/product/Sounds-Good-Dude_T-shirt';
scrapinode.createScraper(productUrl,function(err,scraper){
  if(err){
    console.error(err);
    return;
  }
  console.log(scraper.get('title')); // e.g. "Sounds Good Dude"
});
Code documentation
index.js
module.exports = require('./lib/scrapinode')
Expose the scrapinode library
module.exports = require('./lib/scrapinode');
lib/router.js
ScrapinodeError = require('./error/scrapinode-error')
Module dependencies
var ScrapinodeError = require('./error/scrapinode-error');
module.exports = Router
Expose Router
module.exports = Router;
Router()
Router
class
/**
 * Router constructor.
 * Holds the ordered list of routes, initially empty.
 */
function Router(){
  var routes = [];
  this.routes = routes;
}
Router.prototype.addRoute(route)
Add the given route
into this.routes
/**
 * Append the given route at the end of `this.routes`.
 *
 * @param {Route} route
 * @return {Router} itself, so calls can be chained
 */
Router.prototype.addRoute = function(route){
  var routes = this.routes;
  routes[routes.length] = route;
  return this;
};
Parameters:
- route {Route} -
Returns:
- {Router}
Router.prototype.dispatch(url,content,index)
Find the relevant operation for the given url
and content
expected
/**
 * Find the first route, starting at position `index`, that matches the given
 * url and content.
 *
 * @param {String} url
 * @param {String} content
 * @param {Number} [index=0] position in `this.routes` where the search starts
 * @return {Object} `{ index, operation }` of the first matching route
 * @throws {ScrapinodeError} when no route from `index` onwards matches
 */
Router.prototype.dispatch = function(url,content,index){
  // An omitted index means "search from the first route"; without this default
  // the loop condition `undefined < length` is false and nothing is searched
  var start = index || 0;
  for(var i = start; i < this.routes.length; i++){
    var route = this.routes[i];
    if(route.match(url,content)){
      return {
        index : i,
        operation : route.operation
      };
    }
  }
  var message = 'Dead end, no content found and no more route.';
  var infos = {
    url : url,
    content : content,
    // JSON.stringify renders a RegExp as "{}" and drops functions,
    // so describe every route with the string form of its patterns
    routes : this.routes.map(function(route){
      return {
        path : String(route.path),
        content : String(route.content)
      };
    })
  };
  throw new ScrapinodeError(message + ' Informations: ' + JSON.stringify(infos));
};
Parameters:
- url {String} -
- content {String} -
- index {Number} -
Returns:
- {Object} -
lib/scrapinode.js
browser = require('./browser'),
Modules dependencies
var browser = require('./browser'),
Router = require('./router'),
Route = require('./route'),
Scraper = require('./scraper'),
defaults = require('./defaults/');
module.exports = exports = new Scrapinode()
Expose an instance of Scrapinode
as a convenience mostly
module.exports = exports = new Scrapinode();
exports.createInstance()
Create an instance of Scrapinode
exports.createInstance = function(engine){
return new Scrapinode(engine);
};
Returns:
- {Scrapinode} a new instance of `Scrapinode`
Scrapinode()
Scrapinode
class
/**
 * Scrapinode constructor.
 *
 * @param {String} [engine='jsdom'] name of the engine used when none is given to createScraper
 */
function Scrapinode(engine){
  // Any falsy engine falls back to jsdom
  this.engine = engine ? engine : 'jsdom';
  // Every instance owns its own router
  this.router = new Router();
}
Scrapinode.prototype.createScraper(options,callback)
Create a scraper for the given url
with the given engine
/**
 * Create a scraper for the given url, with the given engine.
 *
 * The options handed in by the caller are never modified: defaults are applied
 * to a private copy, and that copy is what the browser receives.
 *
 * @param {String|Object} options url of the page, or a set of options (options.url is then mandatory)
 * @param {String} options.url url of the page
 * @param {String} [options.engine=this.engine] name of the engine
 * @param {String} [options.html] HTML content
 * @param {Number} [options.timeout=5000] timeout in milliseconds
 * @param {Number} [options.retries=3] maximum number of times the request is resent (0 disables retries)
 * @param {Number} [options.redirects=5] maximum number of redirections followed (0 disables them)
 * @param {Object} [options.headers] HTTP headers used by the request
 * @param {Function} callback called with (err, scraper) when the scraper is ready
 */
Scrapinode.prototype.createScraper = function (options,callback){
  var self = this;
  var given = typeof(options) === 'string' ? { url : options } : options;

  // Copy the options (and the nested headers) so that neither the defaults
  // below nor the changes made while loading the page leak into the caller's object
  var settings = {};
  for(var key in given){
    settings[key] = given[key];
  }
  var headers = {};
  if(given.headers){
    for(var name in given.headers){
      headers[name] = given.headers[name];
    }
  }
  settings.headers = headers;

  if(!settings.engine) settings.engine = self.engine;
  if(!settings.timeout) settings.timeout = 5000;
  // Only a missing value gets the default: an explicit 0 is a valid request
  if(settings.retries === undefined || settings.retries === null) settings.retries = 3;
  if(settings.redirects === undefined || settings.redirects === null) settings.redirects = 5;
  if(!headers['user-agent']) headers['user-agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.107 Safari/535.1';
  if(!headers['accept']) headers['accept'] = 'text/html, application/xhtml+xml, application/xml; q=0.9';
  if(!headers['accept-charset']) headers['accept-charset'] = 'utf-8; q=1.0, ISO-8859-1; q=0.7, *; q=0.3';
  if(!headers['accept-encoding']) headers['accept-encoding'] = 'gzip, deflate';

  browser.load(settings,function(err,window){
    if(err) return callback(err);
    // settings.url is read after loading: the browser may have updated it
    var scraper = new Scraper(settings.url,window,self.router);
    callback(err,scraper);
  });
};
Parameters:
- options {String|Object} - url of the page or a set of options in this case options.url is mandatory
- options.url {String} - url of the page
- [options.engine=this.engine] {String} - name of the engine (jsdom or cheerio)
- [options.html=undefined] {String} - HTML content
- [options.timeout=5000] {Number} - number of milliseconds after which the timeout is reached
- [options.retries=3] {Number} - maximum number of times the request is resent
- [options.redirects=5] {Number} - maximum number of redirections followed
- [options.headers] {Object} - HTTP headers used by the request
- callback {Function} - call when the scraper is ready for work
- callback().err {Error} -
- callback().scraper {Scraper} -
Scrapinode.prototype.use(path,content,operation)
Add the given operation
in the router for the given path
and content
/**
 * Register an operation in the router for the given path and content.
 *
 * @param {String|RegExp} path
 * @param {String|RegExp} content
 * @param {Function} operation function called when the path and the content match
 * @return {Scrapinode} itself, so calls can be chained
 */
Scrapinode.prototype.use = function(path,content,operation){
  var route = new Route(path,content,operation);
  this.router.addRoute(route);
  return this;
};
Parameters:
- path {String|RegExp} -
- content {String|RegExp} -
- operation {Function} - function called when the path and the content match
Returns:
- {Scrapinode} itself, allow to chain methods call
Scrapinode.prototype.useAll(routes)
Add all given routes
characterized by a path
, a content
and an operation
in the router
/**
 * Register every given route definition in the router, in order.
 *
 * @param {Array} routes
 * @param {String|RegExp} routes[].path
 * @param {String|RegExp} routes[].content
 * @param {Function} routes[].operation
 * @return {Scrapinode} itself, so calls can be chained
 */
Scrapinode.prototype.useAll = function(routes){
  routes.forEach(function(definition){
    var route = new Route(definition.path,definition.content,definition.operation);
    this.router.addRoute(route);
  },this);
  return this;
};
Parameters:
- routes {Array} -
- routes[].path {String|RegExp} -
- routes[].content {String|RegExp} -
- routes[].operation {Function} -
Returns:
- {Scrapinode} itself, allow to chain methods call
Scrapinode.prototype.clearRouter()
Remove all the routes available in the router
/**
 * Remove all the routes available in the router.
 * The router object is kept; only its list of routes is replaced by a new empty one.
 *
 * @return {Scrapinode} itself, so calls can be chained
 */
Scrapinode.prototype.clearRouter = function(){
  var router = this.router;
  router.routes = [];
  return this;
};
Returns:
- {Scrapinode} itself, allow to chain methods call
Scrapinode.prototype.defaults()
Expose default routes for the contents "title", "descriptions", "videos", "images"
/**
 * Expose the default routes for the contents "title", "descriptions", "videos", "images".
 * The module-level `defaults` value is returned as is, not copied.
 *
 * @return {Array} routes
 */
Scrapinode.prototype.defaults = function(){
  var routes = defaults;
  return routes;
};
Returns:
- {Array} routes
lib/scraper.js
module.exports = Scraper
Expose Scraper
module.exports = Scraper;
Scraper(url,window,router)
Scraper
class
/**
 * Scraper constructor: keeps everything needed to extract contents from one page.
 *
 * @param {String} url url of the page
 * @param {Object} window object handed to every operation
 * @param {Router} router router used to find the operations
 */
function Scraper(url,window,router){
  var scraper = this;
  scraper.url = url;
  scraper.window = window;
  scraper.router = router;
}
Parameters:
- url {String} -
- window {Object} -
- router {Router} -
Scraper.prototype.get(content,[index])
Get the given content
from the this.router
/**
 * Get the given content by running the matching operations found by the router.
 *
 * Operations are tried one after the other, starting at `index`, until one of
 * them returns something other than `null`. When the router runs out of
 * matching routes, the error thrown by its dispatch method propagates to the caller.
 *
 * @param {String} content
 * @param {Number} [index] position of the first route considered, 0 when falsy
 * @return {String|Number|Object|Boolean|undefined} value returned by the first operation not answering null
 */
Scraper.prototype.get = function(content,index) {
  var cursor = index ? index : 0;
  for(;;){
    var hit = this.router.dispatch(this.url,content,cursor);
    var found = hit.operation(this.window);
    if(found !== null) return found;
    // null means "not found here": resume right after the route just tried
    cursor = hit.index + 1;
  }
};
Parameters:
- content {String} -
- [index] {Number} -
Returns:
- {String|Number|Object|Boolean|null|undefined}
lib/route.js
module.exports = Route
Expose Route
module.exports = Route;
Route(path,content,operation)
Route
class
/**
 * Route constructor.
 * String patterns are compiled into regular expressions, the special string
 * '*' standing for "match anything"; RegExp patterns are stored untouched.
 *
 * @param {String|RegExp} path pattern tested against the url of the page
 * @param {String|RegExp} content pattern tested against the name of the content
 * @param {Function} operation function attached to the route
 */
function Route(path,content,operation){
  // Compile a string pattern, leave any other value as it is
  function toPattern(pattern){
    if(typeof(pattern) !== 'string') return pattern;
    return new RegExp(pattern === '*' ? '.*' : pattern);
  }

  this.path = toPattern(path);
  this.content = toPattern(content);
  this.operation = operation;
}
Parameters:
- path {String|RegExp} -
- content {String|RegExp} -
- operation {Function} -
Route.prototype.match(url,content)
Attempt to match the given url
with this.path
and content
with this.content
/**
 * Attempt to match the given url with `this.path` and the given content
 * with `this.content`.
 *
 * @param {String} url
 * @param {String} content
 * @return {Boolean} true when both patterns match
 */
Route.prototype.match = function(url,content){
  // A RegExp carrying the "g" or "y" flag resumes its search at `lastIndex`,
  // so the same route would match only every other call; always start from 0
  this.path.lastIndex = 0;
  this.content.lastIndex = 0;
  return this.path.test(url) && this.content.test(content);
};
Parameters:
- url {String} -
- content {String} -
Returns:
- {Boolean}
lib/browser.js
jsdom = require('jsdom'),
Modules dependencies
var jsdom = require('jsdom'),
cheerio = require('cheerio'),
request = new require('superagent').agent(),
fs = require('fs'),
ScrapinodeError = require('./error/scrapinode-error'),
HTTPError = require('httperror');
jqueryExt = fs.readFileSync(__dirname + '/../deps/jquery-regex-selector.js').toString()
jQuery dependencies
var jqueryExt = fs.readFileSync(__dirname + '/../deps/jquery-regex-selector.js').toString();
var jquery = fs.readFileSync(__dirname + '/../deps/jquery-2.0.2.min.js').toString() + jqueryExt;
zepto = fs.readFileSync(__dirname + '/../deps/zepto-v1.0.js').toString()
Zepto library
var zepto = fs.readFileSync(__dirname + '/../deps/zepto-v1.0.js').toString();
exports.load(options,callback)
Build the DOM of the given page found at options.url
or options.html
exports.load = function(options,callback){
if(options.html){
return process.nextTick(function(){
buildDOM(options.html,options.engine,options.url,callback);
});
}
getRequest(options,function(err,body){
if(err) return callback(err);
buildDOM(body,options.engine,options.url,callback);
});
};
Parameters:
- options {Object} -
- callback {Function} -
- callback().err {Error} -
- callback().window {Object} -
getRequest(options,callback)
Send an HTTP GET request to options.url
/**
 * Send an HTTP GET request to `options.url` and hand the body to the callback.
 *
 * - A failed request is sent again while `options.retries` is greater than 0;
 *   the counter is decremented on `options` itself and never goes below 0.
 * - A `<meta http-equiv="refresh">` found in the page is followed like a
 *   redirection (`options.url` is updated), at most `options.redirects` times,
 *   so a page refreshing to itself cannot trigger an endless series of requests.
 * - When the response is an image, the download is dropped and a minimal HTML
 *   page embedding the image is returned instead.
 *
 * @param {Object} options configuration of the HTTP GET request
 * @param {String} options.url url requested
 * @param {Object} options.headers set of headers
 * @param {Number} options.timeout timeout
 * @param {Number} options.redirects number of redirections followed
 * @param {Number} options.retries number of times the request is resent after a failure
 * @param {Function} callback called with (err, body)
 * @param {Number} [htmlRedirects=0] internal counter of HTML refreshes already followed
 */
function getRequest(options,callback,htmlRedirects){
  htmlRedirects = htmlRedirects || 0;
  var destroyed = false;
  request.get(options.url)
    .set(options.headers)
    .timeout(options.timeout)
    .redirects(options.redirects)
    .buffer(false)
    .end(function(err,res){
      if(err) return onError(err);
      // Check HTTP status code
      var isHTTPError = isRedirect(res.status) || isClientError(res.status) || isServerError(res.status);
      if(isHTTPError) return onError(new HTTPError(res.status));
      // Attach event handlers and build the body
      var body = '';
      res.on('data',function(chunk){
        body += chunk;
      });
      res.on('end',function(){
        // The image branch below already answered the callback
        if(destroyed) return;
        // Check if a HTTP refresh/redirection is present into the HTML page, if yes refreshes/redirects.
        var matches = body.match(/<meta[ ]*http-equiv="REFRESH"[ ]*content="[0-9]{1,};[ ]*URL=(.*?)"[ ]*\/?>/i);
        var maxHtmlRedirects = typeof(options.redirects) === 'number' ? options.redirects : 5;
        if(matches && matches[1] && htmlRedirects < maxHtmlRedirects){
          options.url = matches[1];
          return getRequest(options,callback,htmlRedirects + 1);
        }
        callback(null,body);
      });
      res.on('error',onError);
      // Check if content-type is an image, if yes destroy the response and build a HTML page with the image in it
      if(isImage(res.headers)){
        res.destroy();
        destroyed = true;
        body = '<!DOCTYPE html><html><head></head><body><img src="' + options.url + '" /></body></html>';
        return callback(null,body);
      }
    });

  // Error event handler: retry while attempts remain, otherwise report the error
  function onError(err){
    // Compare explicitly: a negative counter is truthy and would retry forever
    if(options.retries > 0){
      options.retries--;
      return getRequest(options,callback,htmlRedirects);
    }
    callback(err);
  }
}
Parameters:
- options {Object} - configuration of the HTTP GET request
- options.headers {Object} - set of headers
- options.timeout {Number} - timeout
- options.redirects {Number} - number of times the request will follow redirection instructions
- options.retries {Number} - number of times the request will be resent if the request failed
- callback {Function} -
buildDOM(body,engine,url,callback)
Build a DOM representation of the given HTML body
/**
 * Build a DOM representation of the given HTML body.
 *
 * The callback receives an object exposing `$` and `location.href`: a jsdom
 * window (with jquery or zepto injected) or a small stand-in built around cheerio.
 * With jsdom the window is closed as soon as the callback returns, so the
 * callback has to use it synchronously.
 *
 * @param {String} body html page
 * @param {String} engine name of the engine used to generate the DOM ('jsdom', 'jsdom+zepto' or 'cheerio')
 * @param {String} url url of the page containing the given `body`
 * @param {Function} callback called exactly once with (err, window)
 */
function buildDOM(body,engine,url,callback){
  if(!body){
    return callback(new ScrapinodeError('The HTTP response contains an empty body: "' + body +'"'));
  }
  if(engine === 'jsdom' || engine === 'jsdom+zepto'){
    var library = engine === 'jsdom+zepto' ? zepto : jquery;
    try{
      jsdom.env({
        html: body,
        src : [library],
        done : function(err,window){
          if(err) return callback(err);
          if(!window) return callback(new ScrapinodeError('The "window" provides by JSDOM is falsy: ' + window));
          window.location.href = url;
          callback(err,window);
          window.close();
        }
      });
    }catch(err){
      callback(err);
    }
  }else if(engine === 'cheerio'){
    var $;
    try{
      $ = cheerio.load(body);
    }catch(err){
      // Stop here: falling through would answer the callback a second time
      return callback(err);
    }
    var window = {
      $ : $,
      location : {
        href : url
      }
    };
    callback(null,window);
  }else{
    callback(new ScrapinodeError('The engine "' + engine + '" is not supported. Scrapinode only supports jsdom and cheerio.'));
  }
}
Parameters:
- body {String} - html page
- engine {String} - name of the engine used to generate the DOM
- url {String} - url of the page containing the given `body`
- callback {Function} -
isRedirect(code)
Check if the code
is a HTTP redirection status.
/**
 * Check if the code is a HTTP redirection status (300 to 399 included).
 *
 * @param {Number} code
 * @return {Boolean}
 */
function isRedirect(code) {
  // Exclusive upper bound at 400 so that 399 belongs to the class too
  return (code >= 300 && code < 400);
}
Parameters:
- code {Number}
Returns:
- {Boolean}
isClientError(code)
Check if the code
is a HTTP client error status.
/**
 * Check if the code is a HTTP client error status (400 to 499 included).
 *
 * @param {Number} code
 * @return {Boolean}
 */
function isClientError(code){
  // Exclusive upper bound at 500 so that 499 belongs to the class too
  return (code >= 400 && code < 500);
}
Parameters:
- code {Number}
Returns:
- {Boolean}
isServerError(code)
Check if the code
is a HTTP server error status.
/**
 * Check if the code is a HTTP server error status (500 to 599 included).
 *
 * @param {Number} code
 * @return {Boolean}
 */
function isServerError(code){
  // Exclusive upper bound at 600 so that 599 belongs to the class too
  return (code >= 500 && code < 600);
}
Parameters:
- code {Number}
Returns:
- {Boolean}
isImage(headers)
Check if the content of the HTTP body is an image
/**
 * Check if the content of the HTTP body is an image, according to the
 * "content-type" header.
 *
 * @param {Object} headers HTTP response headers, may be missing
 * @return {Boolean}
 */
function isImage(headers){
  var contentType = '';
  if(headers){
    contentType = headers['content-type'];
  }
  return /image\//i.test(contentType);
}
Parameters:
- headers {Object} -
Returns:
- {Boolean}
lib/utils/index.js
url = require('url'),
Module dependencies
var url = require('url'),
domains = require('./domains');
exports.isURL(path)
Check if the given path
is an URL
exports.isURL = function isURL(path) {
var regex = /(https?:)?\/\/([\-\w\.]+)+/i;
return regex.test(path);
};
Parameters:
- path {String} -
Returns:
- {Boolean}
exports.toURL(path,uri)
Convert a relative/absolute path into an URL
exports.toURL = function toURL(path,uri){
var absolutePath = path;
if(!exports.isURL(path)){
var explodeURL = url.parse(uri);
// 2 cases: absolute path and relative path to the current pathname
if( path.charAt(0) === '/'){
absolutePath = explodeURL.protocol + '//' + explodeURL.host + path;
}else{
var explodePathname = explodeURL.pathname.split('/');
var pathname = explodePathname.slice(0,explodePathname.length - 1).join('/');
absolutePath = explodeURL.protocol + '//' + (explodeURL.host + '/' +pathname + '/' + path).replace('\/\/','/','g');
}
}
return absolutePath;
};
Parameters:
- path {String} - relative/absolute path to a resource
- uri {String} - url
Returns:
- {String}
exports.inline(text)
Inline the text: remove the newline characters ("\n") and collapse runs of spaces (" ") into a single space
exports.inline = function inline(text){
var explode = text.split('\n').join('').split(' ');
var size = explode.length;
for(var i=0; i < size ; i++){
if(!explode[i]){
explode.splice(i,1);
i--;
size--;
}
}
return explode.join(' ').trim();
};
Parameters:
- text {String} -
Returns:
- {String}
exports.getWebsiteName(uri)
Get the name of the website from an uri
exports.getWebsiteName = function getWebsiteName(uri){
var hostname = url.parse(uri).hostname;
var name = '';
if(hostname){
var subdomains = domains;
var components = hostname.split('.');
for(var i = components.length -1 ; i >= 0; i-- ){
if(subdomains[components[i]]){
subdomains = subdomains[components[i]];
}else{
name = components[i];
break;
}
}
}
return name;
};
Parameters:
- uri {String}
Returns:
- {String} - name of the website
lib/error/scrapinode-error.js
util = require('util')
Module dependencies
var util = require('util');
module.exports = ScrapinodeError
Expose ScrapinodeError
module.exports = ScrapinodeError;
ScrapinodeError()
Create a new ScrapinodeError
/**
 * Create a new ScrapinodeError.
 *
 * @param {String} message description of the error
 */
function ScrapinodeError(message){
  Error.call(this);
  this.name = 'ScrapinodeError';
  this.message = message;
  // The constructor is referenced by name: `arguments.callee` throws a
  // TypeError in strict mode. The trace is captured once name and message
  // are set so that its first line reads "ScrapinodeError: <message>".
  Error.captureStackTrace(this,ScrapinodeError);
}
util.inherits(ScrapinodeError,Error);
lib/defaults/index.js
utils = require('./../utils/')
Module dependencies
var utils = require('./../utils/');
module.exports = exports = [
Expose default routes each route is composed of a path, a content name and an operation
// Default routes exported by this module.
// Each entry has the shape { path, content, operation }. Every path is '*',
// and each content name is bound to the scraping function of the same purpose
// declared further down in this file.
module.exports = exports = [
{
// descriptions of the page
path : '*',
content : 'descriptions',
operation : scrapDescription
},
{
// title of the page
path : '*',
content : 'title',
operation : scrapTitle
},
{
// image urls found on the page
path : '*',
content : 'images',
operation : scrapImage
},
{
// video urls found on the page
path : '*',
content : 'videos',
operation : scrapVideo
}
];
scrapDescription(window)
Retrieve descriptions of the page
/**
 * Retrieve descriptions of the page.
 *
 * Sources, in order: Open Graph meta tags, Schema.org "description" items and
 * the description meta tag. Only when none of them gives anything, the text of
 * div/p elements whose class or id is "productdesc" is used.
 *
 * @param {Object} window object representing the window, `window.$` being the selector engine
 * @return {Array} descriptions found, possibly empty
 */
function scrapDescription(window){
  var $ = window.$;
  var descriptions = [];

  // Open Graph protocol by Facebook <meta property="og:description" content="(*)"/>
  $('meta[property="og:description"]').each(function(){
    var content = $(this).attr('content');
    if(content) descriptions.push(content);
  });

  // Schema.org : <* itemprop="description">(*)</*>
  $('[itemprop="description"]').each(function(){
    var text = $(this).text();
    if(text) descriptions.push(text);
  });

  // Meta tag description: <meta name="description" content="(*)" />
  $('meta[name="description"]').each(function(){
    var content = $(this).attr('content');
    // A meta tag without a content attribute has nothing to inline
    if(!content) return;
    var description = utils.inline(content).trim();
    if(description) descriptions.push(description);
  });

  // Random text in div and p tags. Oriented product informations
  if(descriptions.length === 0){
    $('div,p').each(function(){
      var element = $(this);
      var className = element.attr('class');
      var id = element.attr('id');
      var isProductDesc = (className && className.toLowerCase() === 'productdesc') || (id && id.toLowerCase() === 'productdesc');
      if(isProductDesc){
        var description = utils.inline(element.text()).trim();
        if(description) descriptions.push(description);
      }
    });
  }
  return descriptions;
}
Parameters:
- window {Object} - object representing the window
Returns:
- {Array}
isValidExtension(src)
Check if the extension is considered valid
/**
 * Check if the extension is considered valid.
 * The extension is whatever follows the last "." of `src`; it is rejected only
 * when `ENUM_INVALID_EXTENSIONS` maps it to exactly `false`.
 * NOTE(review): ENUM_INVALID_EXTENSIONS is declared outside this block - confirm its content.
 *
 * @param {String} src url of the image
 * @return {Boolean} true if valid, false otherwise
 */
function isValidExtension(src){
  var parts = src.split('.');
  var extension = parts[parts.length - 1];
  return ENUM_INVALID_EXTENSIONS[extension] !== false;
}
Parameters:
- src {String} - url of the image
Returns:
- {Boolean} true if valid, false otherwise
scrapImage(window)
Retrieve image urls on the page
/**
 * Retrieve image urls on the page.
 *
 * Sources, in order: Open Graph meta tags, Schema.org images, product images
 * (only when nothing was found so far) and finally every image of the page
 * (only while fewer than 10 urls were collected). An image whose alt text is
 * equal to the title of the page is put first. Sources rejected because of
 * their extension are returned only when nothing else was found.
 *
 * @param {Object} window object exposing `$` and `location.href`
 * @return {Array} urls of the images
 */
function scrapImage(window){
  var $ = window.$;
  // Base used to resolve relative sources: the url of the page is exposed
  // as `window.location.href`
  var url = window.location.href;
  var thumbs = [];
  var thumbsRejected = [];
  var title = scrapTitle(window);

  var addToThumbs = function(image,beginning){
    var src = $(image).attr('src');
    if(src && isValidExtension(src) ){
      src = utils.toURL(src,url);
      if(beginning){
        thumbs.unshift(src);
      }else{
        thumbs.push(src);
      }
    }else if(src){
      thumbsRejected.push(src);
    }
  };

  // Open Graph protocol by Facebook: <meta property="og:image" content="(*)"/>
  $('meta[property="og:image"]').each(function(){
    var content = $(this).attr('content');
    // The page url is passed along so that a relative og:image can be resolved too
    if(content) thumbs.push(utils.toURL(content,url));
  });

  // Schema.org: <img itemprop="image" src="(*)"/>
  $('img[itemprop="image"]').each(function(){
    addToThumbs(this);
  });

  // Oriented product informations
  if(thumbs.length < 1){
    $('img[id*="product"]').each(function(){
      addToThumbs(this);
    });
    $('img[class*="product"]').each(function(){
      addToThumbs(this);
    });
  }

  // Grab all images
  if(thumbs.length < 10){
    $('img').each(function(){
      // Schema.org images were handled above
      if($(this).attr('itemprop') === 'image') return;
      var alt = $(this).attr('alt');
      // Leave this test alone
      // the selector 'img[alt="title"]' will not work if the title is like LG 42PT35342" PLASMA TV. Escaping issues.
      // Image where the title of the page is equal to the content of the alt attribute of the image tag.
      if(alt === title){
        addToThumbs(this,true);
      }else{
        addToThumbs(this);
      }
    });
  }

  if(thumbs.length === 0){
    thumbs = thumbsRejected;
  }
  return thumbs;
}
Parameters:
- window {Object} -
Returns:
- {Array}
scrapTitle(window)
Retrieve the most appropriate title of the page
/**
 * Retrieve the most appropriate title of the page.
 *
 * Candidates are read, in order, from the <title> tag, the "title" meta tag,
 * the Open Graph title, the first <h1> and the items carrying itemprop="name".
 * The first non-empty candidate that does not mention the name of the website
 * wins; failing that, the first non-empty candidate; failing that, ''.
 *
 * @param {Object} window object exposing `$` and `location.href`
 * @return {String} title of the page, inlined
 */
function scrapTitle(window){
  var $ = window.$;

  // Tags or attributes which can contain a nice title for the page
  var candidates = [
    $('title').text().trim(),
    $('meta[name="title"]').attr('content'),
    $('meta[property="og:title"]').attr('content'),
    $('h1').eq(0).text().trim(),
    $('[itemprop="name"]').text().trim()
  ];

  // Regex of the web site name
  var siteName = new RegExp(utils.getWebsiteName(window.location.href),'i');

  var firstNonEmpty = '';
  for(var i = 0; i < candidates.length; i++){
    var candidate = candidates[i];
    if(!candidate) continue;
    // Best case: a title free of the website name
    if(!siteName.test(candidate)) return utils.inline(candidate);
    if(!firstNonEmpty) firstNonEmpty = candidate;
  }
  return utils.inline(firstNonEmpty);
}
Parameters:
- window {Object} -
Returns:
- {String} title of the page
scrapVideo(window)
Retrieve the video urls on the page
/**
 * Retrieve the video urls on the page: Open Graph "og:video" meta tags first,
 * then the sources of the <video> and <embed> elements.
 *
 * @param {Object} window object exposing `$` and `location.href`
 * @return {Array} urls of the videos
 */
function scrapVideo(window){
  var $ = window.$;
  var url = window.location.href;
  var thumbs = [];

  // Open Graph protocol by Facebook: <meta property="og:video" content="(*)"/>
  $('meta').each(function(){
    var property = $(this).attr('property');
    var content = $(this).attr('content');
    if(property === 'og:video' && content){
      // The page url is passed along so that a relative og:video can be resolved too
      thumbs.push(utils.toURL(content,url));
    }
  });

  $('video, embed').each(function(){
    var src = $(this).attr('src');
    if(src) thumbs.push(utils.toURL(src,url));
  });
  return thumbs;
}
Parameters:
- window {Object} -
Returns:
- {Array}
Licence
(The MIT License)
Copyright (c) 2013 Rémy Loubradou
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.