scrapinode

content driven and route based scraper

version 0.2.0 by Remy Loubradou licensed under MIT


Status:
Build Status Dependency Status 

When to use it?

When you want to retrieve information about the page pointed to by the URL that your user has just copied, scrapinode is a great fit. First, scrapinode comes out of the box with a one-line feature that will give you the title, the description and the images of any HTML page on the web. Second, if you need more, you can extend it. See the examples to learn more about it.

Features

  • Retrieve content like "title", "descriptions", "images", "videos" on any HTML pages with 1 line of code.
  • Define specific operations based on the URL of the page and the content you hope to retrieve, using regexes.
  • Scrape pages with jsdom + jquery or with cheerio.
  • The HTTP client takes care to handle HTTP and HTML redirections.
  • Scrape an image as if it were an HTML page.

Install

npm install scrapinode

Usage

var scrapinode = require('scrapinode');

// Define an operation for a specific route and content
scrapinode.use('society6.com','title',function(window){
  var $ = window.$;
  var url = window.location.href; // url of the page maybe you want to check for some reasons
  var title = $('h1[itemprop="name"]').text();
  if(!title) return null;
  return title;
});

// Use default operations for content like "title", "descriptions", "images", "videos"
scrapinode.useAll(scrapinode.defaults());

scrapinode.createScraper('http://society6.com/product/Sounds-Good-Dude_T-shirt',function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   console.log(title); // "Sound Good dude"
});

Test

npm test

Test coverage

make coverage

Examples

Define a new operation for a route

A route is characterized by a content and a path (each is a regex or a string); an operation is attached to each route.

var scrapinode = require('./../');

scrapinode.use(/http:\/\/society6\.com/,'title',function(window){
	var $ = window.$;
	var title = $('h1[itemprop="name"]').text();
	return title;
});

scrapinode.use(/http:\/\/society6\.com/,'artist',function(window){
	var $ = window.$;
	var artist = $('.details>h3>a').text();
	return artist;
});

scrapinode.createScraper('http://society6.com/product/Sounds-Good-Dude_T-shirt',function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   var artist = scraper.get('artist');
   console.log(title,artist); // "Sounds Good Dude Chase Kunz"
});

scrapinode.createScraper('http://society6.com/product/the-lord-of-fashion_T-shirt',function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   var artist = scraper.get('artist');
   console.log(title,artist); // "the lord of fashion H A P P Y J O Y"
});

Terminology

If you wonder what I mean by "path", "content", "operation" and "route" this is your answer.

var scrapinode = require('./..');

var path = /http:\/\/society6.com/;
var content = 'title';
// route is association of a "path" and a "content"
// each route has its "operation" associated
var operation = function(window){
	var $ = window.$;
	var title = $('h1[itemprop="name"]').text();
	return title;
};

scrapinode.use(path,content,operation);

scrapinode.createScraper('http://society6.com/product/Sounds-Good-Dude_T-shirt',function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   console.log(title); // "Sound Good dude"
});

Use jsdom and jquery

var scrapinode = require('./../');

scrapinode.use(/http:\/\/society6.com/,'title',function(window){
	var $ = window.$;
	var title = $('h1[itemprop="name"]').text();
	return title;
});

var options = {
	url : 'http://society6.com/product/Sounds-Good-Dude_T-shirt',
	engine : 'jsdom'
};

scrapinode.createScraper(options,function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   console.log(title); // "Sound Good dude"
});

Get the title and the description

var scrapinode = require('./../');

scrapinode.useAll(scrapinode.defaults());

scrapinode.createScraper('http://society6.com/product/Sounds-Good-Dude_T-shirt',function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   var description = scraper.get('descriptions')[0];
   console.log(title,description);
   // "Sound Good dude American Apparel T-shirts are made with 100% fine jersey cotton combed for softness and comfort.
   // (Athletic Grey and Athletic Blue contain 50% polyester / 25% cotton / 25% rayon)"
});

Use jsdom and zepto

var scrapinode = require('./../');

scrapinode.use(/http:\/\/society6.com/,'title',function(window){
	var $ = window.$;
	var title = $('h1[itemprop="name"]').text();
	return title;
});

var options = {
	url : 'http://society6.com/product/Sounds-Good-Dude_T-shirt',
	engine : 'jsdom+zepto'
};

scrapinode.createScraper(options,function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   console.log(title); // "Sound Good dude"
});

Take advantage of the middleware system

The first route/operation used in the stack (router) should be specific and the others less and less specific, more generic.

var scrapinode = require('./../');

scrapinode.use(/http:\/\/society6\.com/,'title',function(window){
	var $ = window.$;
   // this selector is really appropriate for product pages not for the index page
	var title = $('h1[itemprop="name"]').text();
   // if title is falsy, the operation will return "null",
   // which means that the next matching route will be used
   // and so its associated operation will be called with the same "window"
   // this logic goes on and on until an operation returns something else than "null"
   console.log('see the title here is empty: "' + title + '"');
   if(!title) return null;
	return title;
});

scrapinode.use('*','title',function(window){
	var $ = window.$;
   var title = $('title').text();
   return title;
});

scrapinode.createScraper('http://society6.com',function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   console.log(title); // "Society6 | Affordable Art Prints, iPhone Cases and T-shirts"
});

Use cheerio

var scrapinode = require('./../');

scrapinode.use(/http:\/\/society6.com/,'title',function(window){
	var $ = window.$;
	var title = $('h1[itemprop="name"]').text();
	return title;
});

var options = {
	url : 'http://society6.com/product/Sounds-Good-Dude_T-shirt',
	engine : 'cheerio'
};

scrapinode.createScraper(options,function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   console.log(title); // "Sound Good dude"
});

Simple usage

Scrape and get the title of the page using the default operations.

var scrapinode = require('./../');

scrapinode.useAll(scrapinode.defaults());

scrapinode.createScraper('http://society6.com/product/Sounds-Good-Dude_T-shirt',function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   console.log(title); // "Sound Good dude"
});

Code documentation

index.js

module.exports = require('./lib/scrapinode')

Expose the scrapinode library


module.exports = require('./lib/scrapinode');

lib/router.js

ScrapinodeError = require('./error/scrapinode-error')

Module dependencies


var ScrapinodeError = require('./error/scrapinode-error');

module.exports = Router

Expose Router


module.exports = Router;

Router()

Router class


// Router class.
// Holds the ordered stack of routes; the order matters because dispatch()
// walks the stack from a given index, middleware-style.
function Router(){
	this.routes = [];
}

Router.prototype.addRoute(route)

Add the given route into this.routes


// Add the given route at the end of the stack.
//
// @param {Route} route - route to append (checked in dispatch order)
// @returns {Router} itself, allowing method calls to be chained
Router.prototype.addRoute = function(route){
	this.routes.push(route);
	return this;
};

Parameters:

  • route {Route} -

Returns:

  • {Router}

Router.prototype.dispatch(url,content,index)

Find the relevant operation for the given url and content expected


// Find the first route, starting at "index", whose path matches the given
// url and whose content matches the given content name.
//
// @param {String} url - url of the scraped page
// @param {String} content - name of the content looked for (e.g. "title")
// @param {Number} index - position in the route stack to resume from
// @returns {Object} - {index, operation} of the matching route
// @throws {ScrapinodeError} when no remaining route matches
Router.prototype.dispatch = function(url,content,index){
	for(var i = index; i < this.routes.length; i++){
		var route = this.routes[i];
		if(route.match(url,content)){
			var result = {
				index : i,
				operation : route.operation
			};
			return result;
		}
	}

	var message = 'Dead end, no content found and no more route.';
	var infos = {
		url : url,
		content : content,
		routes : this.routes
	};
	// Bug fix: add the missing space between the two sentences of the message
	// (it previously read "...no more route.Informations: ...").
	throw new ScrapinodeError(message + ' Informations: ' + JSON.stringify(infos));
};

Parameters:

  • url {String} -
  • content {String} -
  • index {Number} -

Returns:

  • {Object} -

lib/scrapinode.js

browser = require('./browser'),

Modules dependencies


var browser = require('./browser'),
  Router = require('./router'),
  Route = require('./route'),
  Scraper = require('./scraper'),
  defaults = require('./defaults/');

module.exports = exports = new Scrapinode()

Expose an instance of Scrapinode as a convenience mostly


module.exports = exports = new Scrapinode();

exports.createInstance()

Create an instance of Scrapinode


// Create a new, independent instance of Scrapinode (with its own router and
// default engine), unlike the shared instance exported by this module.
//
// @param {String} [engine] - default engine name ('jsdom', 'jsdom+zepto' or 'cheerio')
// @returns {Scrapinode} a new instance of `Scrapinode`
exports.createInstance = function(engine){
  return new Scrapinode(engine);
};

Returns:

  • {Scrapinode} a new instance of `Scrapinode`

Scrapinode()

Scrapinode class


// Scrapinode class.
//
// @param {String} [engine='jsdom'] - default DOM engine used by createScraper;
//   may be overridden per-call via the options given to createScraper
function Scrapinode(engine){
  this.engine = engine || 'jsdom';
  this.router = new Router();
};

Scrapinode.prototype.createScraper(options,callback)

Create a scraper for the given url with the given engine


// Create a scraper for the given url with the given engine.
//
// @param {String|Object} options - url of the page, or a set of options in
//   which case options.url is mandatory
// @param {Function} callback - called when the scraper is ready, with
//   (err, scraper)
Scrapinode.prototype.createScraper = function (options,callback){
  var self = this;
  // Accept a bare url string as a shorthand for {url: url}.
  if(typeof(options) === 'string'){
    options = { url : options };
  }

  // Bug fix: work on shallow copies so the caller's options and headers
  // objects are never mutated (the HTTP layer rewrites url/retries internally).
  var settings = {};
  for(var key in options){
    settings[key] = options[key];
  }
  var headers = {};
  for(var name in (options.headers || {})){
    headers[name] = options.headers[name];
  }
  settings.headers = headers;

  if(!settings.engine) settings.engine = this.engine;
  if(!settings.timeout) settings.timeout = 5000;
  if(!settings.retries) settings.retries = 3;
  if(!settings.redirects) settings.redirects = 5;
  // Default headers mimic a desktop browser; some sites serve degraded HTML otherwise.
  if(!settings.headers['user-agent']) settings.headers['user-agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.107 Safari/535.1';
  if(!settings.headers['accept']) settings.headers['accept'] = 'text/html, application/xhtml+xml, application/xml; q=0.9';
  if(!settings.headers['accept-charset']) settings.headers['accept-charset'] = 'utf-8; q=1.0, ISO-8859-1; q=0.7, *; q=0.3';
  if(!settings.headers['accept-encoding']) settings.headers['accept-encoding'] = 'gzip, deflate';

  browser.load(settings,function(err,window){
    if(err) return callback(err);
    // settings.url may have been updated by browser.load on meta-refresh.
    var scraper = new Scraper(settings.url,window,self.router);
    callback(err,scraper);
  });
};

Parameters:

  • options {String|Object} - url of the page or a set of options in this case options.url is mandatory
  • options.url {String} - url of the page
  • [options.engine=this.engine] {String} - name of the engine (jsdom or cheerio)
  • [options.html=undefined] {String} - HTML content
  • [options.timeout=5000] {Number} - number of milliseconds after which the timeout is reached
  • [options.retries=3] {Number} - maximum number of times the request is resent
  • [options.redirects=5] {Number} - maximum number of redirections followed
  • [options.headers] {Object} - HTTP headers used by the request
  • callback {Function} - call when the scraper is ready for work
  • callback().err {Error} -
  • callback().scraper {Scraper} -

Scrapinode.prototype.use(path,content,operation)

Add the given operation in the router for the given path and content


// Add the given operation in the router for the given path and content.
//
// @param {String|RegExp} path - matched against the url of the page
// @param {String|RegExp} content - matched against the content name
// @param {Function} operation - function called when the path and the content match
// @returns {Scrapinode} itself, allowing method calls to be chained
Scrapinode.prototype.use = function(path,content,operation){
  this.router.addRoute(new Route(path,content,operation));
  return this;
};

Parameters:

  • path {String|RegExp} -
  • content {String|RegExp} -
  • operation {Function} - function called when the path and the content match

Returns:

  • {Scrapinode} itself, allow to chain methods call

Scrapinode.prototype.useAll(routes)

Add all given routes, each characterized by a path, a content and an operation, in the router


// Add all the given routes (each characterized by a path, a content and an
// operation) to the router, preserving their order.
//
// @param {Array} routes - array of {path, content, operation} objects
// @returns {Scrapinode} itself, allowing method calls to be chained
Scrapinode.prototype.useAll = function(routes){
  for(var i = 0; i < routes.length; i++){
    var route = routes[i];
    this.router.addRoute(new Route(route.path,route.content,route.operation));
  }
  return this;
};

Parameters:

  • routes {Array} -
  • routes[].path {String|RegExp} -
  • routes[].content {String|RegExp} -
  • routes[].operation {Function} -

Returns:

  • {Scrapinode} itself, allow to chain methods call

Scrapinode.prototype.clearRouter()

Remove all the routes available in the router


// Remove all the routes available in the router.
//
// @returns {Scrapinode} itself, allowing method calls to be chained
Scrapinode.prototype.clearRouter = function(){
  this.router.routes = [];
  return this;
};

Returns:

  • {Scrapinode} itself, allow to chain methods call

Scrapinode.prototype.defaults()

Expose default routes for the contents "title", "descriptions", "videos", "images"


// Expose the default routes for the contents "title", "descriptions",
// "videos" and "images" (see lib/defaults/), ready to be passed to useAll().
//
// @returns {Array} routes
Scrapinode.prototype.defaults = function(){
  return defaults;
};

Returns:

  • {Array} routes

lib/scraper.js

module.exports = Scraper

Expose Scraper


module.exports = Scraper;

Scraper(url,window,router)

Scraper class


// Scraper class.
//
// @param {String} url - url of the scraped page
// @param {Object} window - window-like object exposing $ and location.href
// @param {Router} router - router holding the routes and their operations
function Scraper(url,window,router){
	this.url = url;
	this.window = window;
	this.router = router;
}

Parameters:

  • url {String} -
  • window {Object} -
  • router {Router} -

Scraper.prototype.get(content,[index])

Get the given content from this.router


// Get the given content by running the first matching operation found in
// this.router. When an operation returns null ("not found here"), the search
// resumes at the next route, so more generic operations act as fallbacks.
//
// @param {String} content - name of the content (e.g. "title")
// @param {Number} [index] - position in the route stack to start from (defaults to 0)
// @returns {String|Number|Object|Boolean|null|undefined} whatever the matching operation returns
Scraper.prototype.get = function(content,index) {
	if(!index) index = 0;
	var result = this.router.dispatch(this.url,content,index);
	var trouvaille = result.operation(this.window);
	// null means the operation gave up: recurse from the route after the match.
	if(trouvaille === null) return this.get(content,result.index + 1);
	return trouvaille;
};

Parameters:

  • content {String} -
  • [index] {Number} -

Returns:

  • {String|Number|Object|Boolean|null|undefined}

lib/route.js

module.exports = Route

Expose Route


module.exports = Route;

Route(path,content,operation)

Route class


// Route class.
// String paths/contents are compiled to regexes; the wildcard '*' becomes
// the match-anything pattern '.*'.
//
// @param {String|RegExp} path -
// @param {String|RegExp} content -
// @param {Function} operation -
function Route(path,content,operation){
	if(typeof(path) === 'string'){
		path = new RegExp(path === '*' ? '.*' : path);
	}
	if(typeof(content) === 'string'){
		content = new RegExp(content === '*' ? '.*' : content);
	}
	this.path = path;
	this.content = content;
	this.operation = operation;
}

Parameters:

  • path {String|RegExp} -
  • content {String|RegExp} -
  • operation {Function} -

Route.prototype.match(url,content)

Attempt to match the given url with this.path and content with this.content


// Attempt to match the given url against this.path and the given content
// name against this.content.
//
// @param {String} url -
// @param {String} content -
// @returns {Boolean} true when both regexes match
Route.prototype.match = function(url,content){
	return this.path.test(url) && this.content.test(content);
};

Parameters:

  • url {String} -
  • content {String} -

Returns:

  • {Boolean}

lib/browser.js

jsdom = require('jsdom'),

Modules dependencies


var jsdom = require('jsdom'),
    cheerio = require('cheerio'),
    request = new require('superagent').agent(),
    fs = require('fs'),
    ScrapinodeError = require('./error/scrapinode-error'),
    HTTPError = require('httperror');

jqueryExt = fs.readFileSync(__dirname + '/../deps/jquery-regex-selector.js').toString()

jQuery dependencies


var jqueryExt = fs.readFileSync(__dirname + '/../deps/jquery-regex-selector.js').toString();
var jquery = fs.readFileSync(__dirname + '/../deps/jquery-2.0.2.min.js').toString() + jqueryExt;

zepto = fs.readFileSync(__dirname + '/../deps/zepto-v1.0.js').toString()

Zepto library


var zepto = fs.readFileSync(__dirname + '/../deps/zepto-v1.0.js').toString();

exports.load(options,callback)

Build the DOM of the given page found at options.url or options.html


// Build the DOM of the page found at options.url, or of the HTML given in
// options.html (in which case no HTTP request is made).
//
// @param {Object} options -
// @param {Function} callback - called with (err, window)
exports.load = function(options,callback){

    if(options.html){
        // Defer so the callback is invoked asynchronously in both code paths.
        return process.nextTick(function(){
            buildDOM(options.html,options.engine,options.url,callback);
        });
    }

    getRequest(options,function(err,body){
        if(err) return callback(err);
        buildDOM(body,options.engine,options.url,callback);
    });
};

Parameters:

  • options {Object} -
  • callback {Function} -
  • callback().err {Error} -
  • callback().window {Object} -

getRequest(options,callback)

Send an HTTP GET request to options.url


// Send an HTTP GET request to options.url and call back with the body.
// Retries on error up to options.retries times, follows HTML meta-refresh
// redirections, and turns image responses into a minimal HTML page.
//
// @param {Object} options - configuration of the HTTP GET request
//   (url, headers, timeout, redirects, retries)
// @param {Function} callback - called with (err, body)
function getRequest(options,callback){
    // Set once the response stream has been destroyed (image case) so the
    // pending "end" handler does not invoke the callback a second time.
    var destroyed = false;
    var req = request.get(options.url)
        .set(options.headers)
        .timeout(options.timeout)
        .redirects(options.redirects)
        .buffer(false)
        .end(function(err,res){
            if(err) return onError(err);

            // Check HTTP status code. A 3xx here means superagent exhausted
            // its redirect budget; 4xx/5xx are treated as errors.
            var isHTTPError = isRedirect(res.status) || isClientError(res.status) || isServerError(res.status);
            if(isHTTPError) return onError(new HTTPError(res.status));

            // Attach event handlers and build the body
            var body = '';
            res.on('data',function(chunk){
                body += chunk;
            });
            res.on('end',function(){
                if(destroyed) return;
                // Check if a HTTP refresh/redirection is present into the HTML page, if yes refreshes/redirects.
                // NOTE(review): options.url is overwritten here; callers relying
                // on the original url should keep their own copy.
                var matches = body.match(/<meta[ ]*http-equiv="REFRESH"[ ]*content="[0-9]{1,};[ ]*URL=(.*?)"[ ]*\/?>/i);
                if(matches && matches[1]){
                    options.url = matches[1];
                    return getRequest(options,callback);
                }
                callback(null,body);
            });
            res.on('error',onError);

            // Check if content-type is an image, if yes destroy the response and build a HTML page with the image in it
            if(isImage(res.headers)){
                res.destroy();
                destroyed = true;
                body = '<!DOCTYPE html><html><head></head><body><img src="' + options.url + '" /></body></html>';
                return callback(null,body);
            }
        });

    // Error event handler: retry while options.retries is still > 0
    // (post-decrement), then give up and report the error.
    function onError(err){
        if(options.retries--) return getRequest(options,callback);
        callback(err);
    }
}

Parameters:

  • options {Object} - configuration of the HTTP GET request
  • options.headers {String} - set of headers
  • options.timeout {Number} - timeout
  • options.redirects {Number} - number of times the request will follow redirection instructions
  • options.retries {Number} - number of times the request will be resent if the request fails
  • callback {Function} -

buildDOM(body,engine,url,callback)

Build a DOM representation of the given HTML body


// Build a DOM representation of the given HTML body with the chosen engine
// and call back with a window-like object exposing $ and location.href.
//
// @param {String} body - html page
// @param {String} engine - 'jsdom', 'jsdom+zepto' or 'cheerio'
// @param {String} url - url of the page containing the given `body`
// @param {Function} callback - called with (err, window)
function buildDOM(body,engine,url,callback){
    // Guard against empty HTTP responses: nothing to parse.
    if(!body){
        return callback(new ScrapinodeError('The HTTP response contains an empty body: "' + body +'"'));
    }

    if(engine === 'jsdom' || engine === 'jsdom+zepto'){
        // jsdom executes a DOM library inside the page: zepto, or jquery
        // (bundled with the regex-selector extension).
        var library = engine === 'jsdom+zepto' ? zepto : jquery;
        try{
            jsdom.env({
               html: body,
               src : [library],
               done : function(err,window){
                   if(err) return callback(err);
                   if(!window) return callback(new ScrapinodeError('The "window" provides by JSDOM is falsy: ' + window));
                   window.location.href = url;
                   callback(err,window);
                   // Free jsdom resources once the consumer callback has run.
                   window.close();
               }
            });
        }catch(err){
            callback(err);
        }
    }else if(engine === 'cheerio'){
        var $;
        try{
            $ = cheerio.load(body);
        }catch(err){
            // Bug fix: return here; previously execution fell through and the
            // callback was invoked a second time with an undefined "$".
            return callback(err);
        }
        // Minimal window-like object so operations can use window.$ and
        // window.location.href, as with jsdom.
        var window = {
            $ : $,
            location : {
                href : url
            }
        };
        callback(null,window);
    }else{
        callback(new ScrapinodeError('The engine "' + engine + '" is not supported. Scrapinode only supports jsdom and cheerio.'));
    }
}

Parameters:

  • body {String} - html page
  • engine {String} - name of the engine used to generate the DOM
  • url {String} - url of the page containing the given `body`
  • callback {Function} -

isRedirect(code)

Check if the code is a HTTP redirection status.


// Check if the code is a HTTP redirection status (3xx).
//
// @param {Number} code - HTTP status code
// @returns {Boolean} true for any 3xx status
function isRedirect(code) {
  // Fix off-by-one: the previous upper bound (code < 399) excluded 399.
  return (code >= 300 && code < 400);
}

Parameters:

  • code {Number}

Returns:

  • {Boolean}

isClientError(code)

Check if the code is a HTTP client error status.


// Check if the code is a HTTP client error status (4xx).
//
// @param {Number} code - HTTP status code
// @returns {Boolean} true for any 4xx status
function isClientError(code){
    // Fix off-by-one: the previous upper bound (code < 499) excluded 499.
    return (code >= 400 && code < 500);
}

Parameters:

  • code {Number}

Returns:

  • {Boolean}

isServerError(code)

Check if the code is a HTTP server error status.


// Check if the code is a HTTP server error status (5xx).
//
// @param {Number} code - HTTP status code
// @returns {Boolean} true for any 5xx status
function isServerError(code){
    // Fix off-by-one: the previous upper bound (code < 599) excluded 599.
    return (code >= 500 && code < 600);
}

Parameters:

  • code {Number}

Returns:

  • {Boolean}

isImage(headers)

Check if the content of the HTTP body is an image


// Check if the content of the HTTP body is an image, based on the
// content-type response header.
//
// @param {Object} headers - HTTP response headers (may be null/undefined)
// @returns {Boolean}
function isImage(headers){
    // Guard both missing headers and a missing content-type header.
    var contentType = (headers && headers['content-type']) || '';
    return /image\//i.test(contentType);
}

Parameters:

  • headers {Object} -

Returns:

  • {Boolean}

lib/utils/index.js

url = require('url'),

Module dependencies


var url = require('url'),
   domains = require('./domains');

exports.isURL(path)

Check if the given path is an URL


exports.isURL = function isURL(path) {
   var regex = /(https?:)?\/\/([\-\w\.]+)+/i;
   return regex.test(path);
};

Parameters:

  • path {String} -

Returns:

  • {Boolean}

exports.toURL(path,uri)

Convert a relative/absolute path into an URL


exports.toURL = function toURL(path,uri){
   var absolutePath = path;
   if(!exports.isURL(path)){
      var explodeURL = url.parse(uri);
      // 2 cases: absolute path and relative path to the current pathname
      if( path.charAt(0) === '/'){
         absolutePath = explodeURL.protocol + '//' + explodeURL.host + path;
      }else{
         var explodePathname = explodeURL.pathname.split('/');
         var pathname = explodePathname.slice(0,explodePathname.length - 1).join('/');
         absolutePath = explodeURL.protocol + '//' + (explodeURL.host + '/' +pathname + '/' + path).replace('\/\/','/','g');
      }
   }
   return absolutePath;
};

Parameters:

  • path {String} - relative/absolute path to a resource
  • uri {String} - url

Returns:

  • {String}

exports.inline(text)

Inline the text: remove "\n" characters and collapse runs of " " (space) characters


exports.inline = function inline(text){
   var explode = text.split('\n').join('').split(' ');
   var size = explode.length;
   for(var i=0; i < size ; i++){
      if(!explode[i]){
         explode.splice(i,1);
         i--;
         size--;
      }
   }
   return explode.join(' ').trim();
};

Parameters:

  • text {String} -

Returns:

  • {String}

exports.getWebsiteName(uri)

Get the name of the website from an uri


// Get the name of the website from an uri.
// Walks the hostname components right-to-left through the "domains" table;
// the first component not found in the table is taken as the website name
// (e.g. "society6" in "www.society6.com").
//
// @param {String} uri -
// @returns {String} - name of the website, or '' when the uri has no hostname
exports.getWebsiteName = function getWebsiteName(uri){
   var hostname = url.parse(uri).hostname;
   var name = '';
   if(hostname){
      // NOTE(review): "domains" (from ./domains) is presumably a nested map
      // of known public suffixes — verify its shape in that module.
      var subdomains = domains;
      var components = hostname.split('.');
      for(var i = components.length -1 ; i >=  0; i-- ){
         if(subdomains[components[i]]){
            subdomains = subdomains[components[i]];
         }else{
            name = components[i];
            break;
         }
      }
   }
   return name;
};

Parameters:

  • uri {String}

Returns:

  • {String} - name of the website

lib/error/scrapinode-error.js

util = require('util')

Module dependencies


var util = require('util');

module.exports = ScrapinodeError

Expose ScrapinodeError


module.exports = ScrapinodeError;

ScrapinodeError()

Create a new ScrapinodeError


// Create a new ScrapinodeError.
//
// @param {String} message - description of the error
function ScrapinodeError(message){
   Error.call(this);
   // Bug fix: reference the named function instead of arguments.callee,
   // which is deprecated and throws a TypeError in strict mode / ES modules.
   Error.captureStackTrace(this,ScrapinodeError);
   this.name = 'ScrapinodeError';
   this.message = message;
}

util.inherits(ScrapinodeError,Error);

lib/defaults/index.js

utils = require('./../utils/')

Module dependencies


var utils = require('./../utils/');

module.exports = exports = [

Expose default routes each route is composed of a path, a content name and an operation


// Default routes: each route matches any url ('*') and associates a content
// name with the scraping operation defined below in this module.
module.exports = exports = [
   {
      path : '*',
      content : 'descriptions',
      operation : scrapDescription
   },
   {
      path : '*',
      content : 'title',
      operation : scrapTitle
   },
   {
      path : '*',
      content : 'images',
      operation : scrapImage
   },
   {
      path : '*',
      content : 'videos',
      operation : scrapVideo
   }
];

scrapDescription(window)

Retrieve descriptions of the page


// Retrieve descriptions of the page from Open Graph tags, Schema.org
// attributes, meta description tags, and product-description containers.
//
// @param {Object} window - window-like object exposing $
// @returns {Array} list of description strings (possibly empty)
function scrapDescription(window){
   var $ = window.$;
   // Removed unused "url" local: it read window.url, which the window objects
   // built by lib/browser.js never define.
   var descriptions = [];

   // Open Graph protocol by Facebook <meta property="og:description" content="(*)"/>
   $('meta[property="og:description"]').each(function(){
      var content = $(this).attr('content');
      if(content) descriptions.push(content);
   });

   // Schema.org : <* itemprop="description">(*)</*>
   $('[itemprop="description"]').each(function(){
      var text = $(this).text();
      if(text) descriptions.push(text);
   });

   // Meta tag description: <meta name="description" content="(*)" />
   $('meta[name="description"]').each(function(){
      var content = $(this).attr('content');
      // Bug fix: a meta tag without a content attribute used to crash
      // utils.inline(undefined).
      if(!content) return;
      var description = utils.inline(content).trim();
      if(description) descriptions.push(description);
   });

   // Random text in div and p tags. Oriented product informations
   if(descriptions.length === 0){
      $('div,p').each(function(){
         var klass = $(this).attr('class');
         var id = $(this).attr('id');
         if((klass && klass.toLowerCase() === 'productdesc') || (id && id.toLowerCase() === 'productdesc')){
            var description = utils.inline($(this).text()).trim();
            if(description) descriptions.push(description);
         }
      });
   }
   return descriptions;
}

Parameters:

  • window {Object} - object representing the window

Returns:

  • {Array}

isValidExtension(src)

Check if the extension is considered valid


// Check if the extension of the image url is considered valid.
//
// @param {String} src - url of the image
// @returns {Boolean} true if valid, false otherwise
function isValidExtension(src){
   var extension = src.split('.').pop();
   // NOTE(review): ENUM_INVALID_EXTENSIONS is not defined in this excerpt —
   // presumably a map of blacklisted extensions to false; verify in the module.
   // Anything not explicitly mapped to false is accepted.
   var isValid = ENUM_INVALID_EXTENSIONS[extension] === false ? false : true;
   return isValid;
}

Parameters:

  • src {String} - url of the image

Returns:

  • {Boolean} true if valid, false otherwise

scrapImage(window)

Retrieve image urls on the page


// Retrieve image urls on the page, preferring Open Graph and Schema.org
// images, then product-looking images, then any <img> on the page.
//
// @param {Object} window - window-like object exposing $ and location.href
// @returns {Array} list of absolute image urls
function scrapImage(window){
   var $ = window.$;
   // Bug fix: the window objects built by lib/browser.js expose the page url
   // at window.location.href; window.url was always undefined, which broke
   // the resolution of relative image paths below.
   var url = window.location.href;

   var thumbs = [];
   var thumbsRejected = [];
   var title = scrapTitle(window);
   // Push the image src (resolved to an absolute url) into thumbs — at the
   // front when "beginning" is truthy; srcs with a blacklisted extension are
   // kept aside in thumbsRejected as a last resort.
   var addToThumbs = function(image,beginning){
      var src = $(image).attr('src');
      if(src && isValidExtension(src) ){
         src = utils.toURL(src,url);
         if(beginning){
            thumbs.unshift(src);
         }else{
            thumbs.push(src);
         }
      }else if(src){
         thumbsRejected.push(src);
      }
   };
   // Open Graph protocol by Facebook: <meta property="og:image" content="(*)"/>
   $('meta[property="og:image"]').each(function(){
      var content = $(this).attr('content');
      // Bug fix: pass the page url so relative og:image paths can be
      // resolved, consistent with addToThumbs above.
      if(content) thumbs.push(utils.toURL(content,url));
   });

   // Schema.org: <img itemprop="image" src="(*)"/>
   $('img[itemprop="image"]').each(function(){
      addToThumbs(this);
   });

   // Oriented product informations
   if(thumbs.length < 1){
      $('img[id*="product"]').each(function(){
          addToThumbs(this);
      });

      $('img[class*="product"]').each(function(){
          addToThumbs(this);
      });
   }

   // Grab all images
   if(thumbs.length < 10){
      $('img').each(function(){
         if($(this).attr('itemprop') === 'image') return;
         var alt = $(this).attr('alt');
         // Leave this test alone
         // the selector 'img[alt="title"]' will not work if the title is like LG 42PT35342" PLASMA TV. Escaping issues.
         // Image where the title of the page is equal to the content of the alt attribute of the image tag.
         if(alt === title){
            addToThumbs(this,true);
         }else{
            addToThumbs(this);
         }
      });
   }

   // Fall back to the rejected srcs rather than returning nothing at all.
   if(thumbs.length === 0){
      thumbs = thumbsRejected;
   }

   return thumbs;
}

Parameters:

  • window {Object} -

Returns:

  • {Array}

scrapTitle(window)

Retrieve the most appropriate title of the page


// Retrieve the most appropriate title of the page.
//
// @param {Object} window -
// @returns {String} title of the page, inlined ('' when no candidate found)
function scrapTitle(window){
   var $ = window.$;
   var url = window.location.href;

   // Tags or attributes which can contain a nice title for the page,
   // listed in order of decreasing preference.
   var titleTag =  $('title').text().trim();
   var metaTitleTag = $('meta[name="title"]').attr('content');
   var openGraphTitle = $('meta[property="og:title"]').attr('content');
   var h1Tag = $('h1').eq(0).text().trim();
   var itempropNameTag = $('[itemprop="name"]').text().trim();
   var titles = [titleTag, metaTitleTag, openGraphTitle, h1Tag, itempropNameTag];

   // Regex of the web site name
   // NOTE(review): the website name is not regex-escaped; names containing
   // regex metacharacters would alter the match — confirm acceptable.
   var nameWebsite = utils.getWebsiteName(url);
   var regex = new RegExp(nameWebsite,'i');
   // Sort to find the best title: prefer candidates that do NOT contain the
   // website name, then fall back to any non-empty candidate.
   var titlesNotEmpty = titles.filter(function(value){
      return !!value;
   });
   var titlesBest = titlesNotEmpty.filter(function(value){
      return !regex.test(value);
   });
   var bestTitle = (titlesBest && titlesBest[0]) || (titlesNotEmpty && titlesNotEmpty[0]) || '';
   return utils.inline(bestTitle);
}

Parameters:

  • window {Object} -

Returns:

  • {String} title of the page

scrapVideo(window)

Retrieve the video urls on the page


// Retrieve the video urls on the page from Open Graph meta tags and from
// <video>/<embed> elements.
//
// @param {Object} window - window-like object exposing $ and location.href
// @returns {Array} list of absolute video urls
function scrapVideo(window){
   var $ = window.$;
   var url = window.location.href;
   var thumbs = [];

   // Open Graph protocol by Facebook: <meta property="og:video" content="(*)"/>
   $('meta').each(function(){
      var property = $(this).attr('property');
      var content = $(this).attr('content');
      if(property === 'og:video' && content){
         // Bug fix: pass the page url so relative paths can be resolved,
         // consistent with the <video>/<embed> handling below.
         thumbs.push(utils.toURL(content,url));
      }
   });

   // Plain <video> and <embed> sources.
   $('video, embed').each(function(){
      var src = $(this).attr('src');
      if(src) thumbs.push(utils.toURL(src,url));
   });

   return thumbs;
}

Parameters:

  • window {Object} -

Returns:

  • {Array}

Licence

(The MIT License)

Copyright (c) 2013 Rémy Loubradou

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.