scrapinode

content driven and route based scraper

version 0.2.0 by Remy Loubradou licensed under MIT


Status:
Build Status Dependency Status 

When to use it?

When you want to retrieve information about the page pointed to by the URL that your user has just copied, scrapinode is a great fit. First, scrapinode comes out of the box with a one-line feature that will give you the title, the description and the images of any HTML page on the web. Second, if you need more, you can extend it. See the examples to learn more about it.

Features

  • Retrieve content like "title", "descriptions", "images", "videos" on any HTML pages with 1 line of code.
  • Define specific operations based on the URL of the page and the content you hope to retrieve, using regexes.
  • Scrape pages with jsdom + jquery or with cheerio.
  • The HTTP client takes care to handle HTTP and HTML redirections.
  • Scrape an image as if it were an HTML page.

Install

npm install scrapinode

Usage

var scrapinode = require('scrapinode');

// Define an operation for a specific route and content
scrapinode.use('society6.com','title',function(window){
  var $ = window.$;
  var url = window.location.href; // url of the page maybe you want to check for some reasons
  var title = $('h1[itemprop="name"]').text();
  if(!title) return null;
  return title;
});

// Use default operations for content like "title", "descriptions", "images", "videos"
scrapinode.useAll(scrapinode.defaults());

scrapinode.createScraper('http://society6.com/product/Sounds-Good-Dude_T-shirt',function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   console.log(title); // "Sound Good dude"
});

Test

npm test

Test coverage

make coverage

Examples

Define a new operation for a route

A route is characterized by a content and a path (each is a regex or a string); an operation is attached to each route.

var scrapinode = require('./../');

scrapinode.use(/http:\/\/society6\.com/,'title',function(window){
	var $ = window.$;
	var title = $('h1[itemprop="name"]').text();
	return title;
});

scrapinode.use(/http:\/\/society6\.com/,'artist',function(window){
	var $ = window.$;
	var artist = $('.details>h3>a').text();
	return artist;
});

scrapinode.createScraper('http://society6.com/product/Sounds-Good-Dude_T-shirt',function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   var artist = scraper.get('artist');
   console.log(title,artist); // "Sounds Good Dude Chase Kunz"
});

scrapinode.createScraper('http://society6.com/product/the-lord-of-fashion_T-shirt',function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   var artist = scraper.get('artist');
   console.log(title,artist); // "the lord of fashion H A P P Y J O Y"
});

Terminology

If you wonder what I mean by "path", "content", "operation" and "route" this is your answer.

var scrapinode = require('./..');

var path = /http:\/\/society6.com/;
var content = 'title';
// route is association of a "path" and a "content"
// each route has its "operation" associated
var operation = function(window){
	var $ = window.$;
	var title = $('h1[itemprop="name"]').text();
	return title;
};

scrapinode.use(path,content,operation);

scrapinode.createScraper('http://society6.com/product/Sounds-Good-Dude_T-shirt',function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   console.log(title); // "Sound Good dude"
});

Use jsdom and jquery

var scrapinode = require('./../');

scrapinode.use(/http:\/\/society6.com/,'title',function(window){
	var $ = window.$;
	var title = $('h1[itemprop="name"]').text();
	return title;
});

var options = {
	url : 'http://society6.com/product/Sounds-Good-Dude_T-shirt',
	engine : 'jsdom'
};

scrapinode.createScraper(options,function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   console.log(title); // "Sound Good dude"
});

Get the title and the description

var scrapinode = require('./../');

scrapinode.useAll(scrapinode.defaults());

scrapinode.createScraper('http://society6.com/product/Sounds-Good-Dude_T-shirt',function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   var description = scraper.get('descriptions')[0];
   console.log(title,description);
   // "Sound Good dude American Apparel T-shirts are made with 100% fine jersey cotton combed for softness and comfort.
   // (Athletic Grey and Athletic Blue contain 50% polyester / 25% cotton / 25% rayon)"
});

Use jsdom and zepto

var scrapinode = require('./../');

scrapinode.use(/http:\/\/society6.com/,'title',function(window){
	var $ = window.$;
	var title = $('h1[itemprop="name"]').text();
	return title;
});

var options = {
	url : 'http://society6.com/product/Sounds-Good-Dude_T-shirt',
	engine : 'jsdom+zepto'
};

scrapinode.createScraper(options,function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   console.log(title); // "Sound Good dude"
});

Take advantage of the middleware system

The first route/operation used in the stack (router) should be specific and the others less and less specific, more generic.

var scrapinode = require('./../');

scrapinode.use(/http:\/\/society6\.com/,'title',function(window){
	var $ = window.$;
   // this selector is really appropriate for product pages not for the index page
	var title = $('h1[itemprop="name"]').text();
   // if title is falsy, the operation will return "null",
   // which means that the next matching route will be used
   // and so its associated operation will be called with the same "window"
   // this logic goes on and on until an operation returns something else than "null"
   console.log('see the title here is empty: "' + title + '"');
   if(!title) return null;
	return title;
});

scrapinode.use('*','title',function(window){
	var $ = window.$;
   var title = $('title').text();
   return title;
});

scrapinode.createScraper('http://society6.com',function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   console.log(title); // "Society6 | Affordable Art Prints, iPhone Cases and T-shirts"
});

Use cheerio

var scrapinode = require('./../');

scrapinode.use(/http:\/\/society6.com/,'title',function(window){
	var $ = window.$;
	var title = $('h1[itemprop="name"]').text();
	return title;
});

var options = {
	url : 'http://society6.com/product/Sounds-Good-Dude_T-shirt',
	engine : 'cheerio'
};

scrapinode.createScraper(options,function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   console.log(title); // "Sound Good dude"
});

Simple usage

Scrape and get the title of the page using the default operations.

var scrapinode = require('./../');

scrapinode.useAll(scrapinode.defaults());

scrapinode.createScraper('http://society6.com/product/Sounds-Good-Dude_T-shirt',function(err,scraper){
   if(err) return console.error(err);
   var title = scraper.get('title');
   console.log(title); // "Sound Good dude"
});

Code documentation

index.js

module.exports = require('./lib/scrapinode')

Expose the scrapinode library


module.exports = require('./lib/scrapinode');

lib/router.js

ScrapinodeError = require('./error/scrapinode-error')

Module dependencies


var ScrapinodeError = require('./error/scrapinode-error');

module.exports = Router

Expose Router


module.exports = Router;

Router()

Router class


// Router class.
// Holds the ordered stack of routes; the order matters because dispatch()
// walks the stack from a given index, middleware-style.
function Router(){
	this.routes = [];
}

Router.prototype.addRoute(route)

Add the given route into this.routes


// Add the given route at the end of the stack.
//
// @param {Route} route - route to append (checked in dispatch order)
// @returns {Router} itself, allowing method calls to be chained
Router.prototype.addRoute = function(route){
	this.routes.push(route);
	return this;
};

Parameters:

  • route {Route} -

Returns:

  • {Router}

Router.prototype.dispatch(url,content,index)

Find the relevant operation for the given url and content expected


// Find the first route, starting at "index", whose path matches the given
// url and whose content matches the given content name.
//
// @param {String} url - url of the scraped page
// @param {String} content - name of the content looked for (e.g. "title")
// @param {Number} index - position in the route stack to resume from
// @returns {Object} - {index, operation} of the matching route
// @throws {ScrapinodeError} when no remaining route matches
Router.prototype.dispatch = function(url,content,index){
	for(var i = index; i < this.routes.length; i++){
		var route = this.routes[i];
		if(route.match(url,content)){
			var result = {
				index : i,
				operation : route.operation
			};
			return result;
		}
	}

	var message = 'Dead end, no content found and no more route.';
	var infos = {
		url : url,
		content : content,
		routes : this.routes
	};
	// Bug fix: add the missing space between the two sentences of the message
	// (it previously read "...no more route.Informations: ...").
	throw new ScrapinodeError(message + ' Informations: ' + JSON.stringify(infos));
};

Parameters:

  • url {String} -
  • content {String} -
  • index {Number} -

Returns:

  • {Object} -

lib/scrapinode.js

browser = require('./browser'),

Modules dependencies


var browser = require('./browser'),
  Router = require('./router'),
  Route = require('./route'),
  Scraper = require('./scraper'),
  defaults = require('./defaults/');

module.exports = exports = new Scrapinode()

Expose an instance of Scrapinode as a convenience mostly


module.exports = exports = new Scrapinode();

exports.createInstance()

Create an instance of Scrapinode


// Create a new, independent instance of Scrapinode (with its own router and
// default engine), unlike the shared instance exported by this module.
//
// @param {String} [engine] - default engine name ('jsdom', 'jsdom+zepto' or 'cheerio')
// @returns {Scrapinode} a new instance of `Scrapinode`
exports.createInstance = function(engine){
  return new Scrapinode(engine);
};

Returns:

  • {Scrapinode} a new instance of `Scrapinode`

Scrapinode()

Scrapinode class


// Scrapinode class.
//
// @param {String} [engine='jsdom'] - default DOM engine used by createScraper;
//   may be overridden per-call via the options given to createScraper
function Scrapinode(engine){
  this.engine = engine || 'jsdom';
  this.router = new Router();
};

Scrapinode.prototype.createScraper(options,callback)

Create a scraper for the given url with the given engine


// Create a scraper for the given url with the given engine.
//
// @param {String|Object} options - url of the page, or a set of options in
//   which case options.url is mandatory
// @param {Function} callback - called when the scraper is ready, with
//   (err, scraper)
Scrapinode.prototype.createScraper = function (options,callback){
  var self = this;
  // Accept a bare url string as a shorthand for {url: url}.
  if(typeof(options) === 'string'){
    options = { url : options };
  }

  // Bug fix: work on shallow copies so the caller's options and headers
  // objects are never mutated (the HTTP layer rewrites url/retries internally).
  var settings = {};
  for(var key in options){
    settings[key] = options[key];
  }
  var headers = {};
  for(var name in (options.headers || {})){
    headers[name] = options.headers[name];
  }
  settings.headers = headers;

  if(!settings.engine) settings.engine = this.engine;
  if(!settings.timeout) settings.timeout = 5000;
  if(!settings.retries) settings.retries = 3;
  if(!settings.redirects) settings.redirects = 5;
  // Default headers mimic a desktop browser; some sites serve degraded HTML otherwise.
  if(!settings.headers['user-agent']) settings.headers['user-agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.107 Safari/535.1';
  if(!settings.headers['accept']) settings.headers['accept'] = 'text/html, application/xhtml+xml, application/xml; q=0.9';
  if(!settings.headers['accept-charset']) settings.headers['accept-charset'] = 'utf-8; q=1.0, ISO-8859-1; q=0.7, *; q=0.3';
  if(!settings.headers['accept-encoding']) settings.headers['accept-encoding'] = 'gzip, deflate';

  browser.load(settings,function(err,window){
    if(err) return callback(err);
    // settings.url may have been updated by browser.load on meta-refresh.
    var scraper = new Scraper(settings.url,window,self.router);
    callback(err,scraper);
  });
};

Parameters:

  • options {String|Object} - url of the page or a set of options in this case options.url is mandatory
  • options.url {String} - url of the page
  • [options.engine=this.engine] {String} - name of the engine (jsdom or cheerio)
  • [options.html=undefined] {String} - HTML content
  • [options.timeout=5000] {Number} - number of milliseconds after which the timeout is reached
  • [options.retries=3] {Number} - maximum number of times the request is resent
  • [options.redirects=5] {Number} - maximum number of redirections followed
  • [options.headers] {Object} - HTTP headers used by the request
  • callback {Function} - call when the scraper is ready for work
  • callback().err {Error} -
  • callback().scraper {Scraper} -

Scrapinode.prototype.use(path,content,operation)

Add the given operation in the router for the given path and content


// Add the given operation in the router for the given path and content.
//
// @param {String|RegExp} path - matched against the url of the page
// @param {String|RegExp} content - matched against the content name
// @param {Function} operation - function called when the path and the content match
// @returns {Scrapinode} itself, allowing method calls to be chained
Scrapinode.prototype.use = function(path,content,operation){
  this.router.addRoute(new Route(path,content,operation));
  return this;
};

Parameters:

  • path {String|RegExp} -
  • content {String|RegExp} -
  • operation {Function} - function called when the path and the content match

Returns:

  • {Scrapinode} itself, allow to chain methods call

Scrapinode.prototype.useAll(routes)

Add all given routes, each characterized by a path, a content and an operation, in the router


// Add all the given routes (each characterized by a path, a content and an
// operation) to the router, preserving their order.
//
// @param {Array} routes - array of {path, content, operation} objects
// @returns {Scrapinode} itself, allowing method calls to be chained
Scrapinode.prototype.useAll = function(routes){
  for(var i = 0; i < routes.length; i++){
    var route = routes[i];
    this.router.addRoute(new Route(route.path,route.content,route.operation));
  }
  return this;
};

Parameters:

  • routes {Array} -
  • routes[].path {String|RegExp} -
  • routes[].content {String|RegExp} -
  • routes[].operation {Function} -

Returns:

  • {Scrapinode} itself, allow to chain methods call

Scrapinode.prototype.clearRouter()

Remove all the routes available in the router


// Remove all the routes available in the router.
//
// @returns {Scrapinode} itself, allowing method calls to be chained
Scrapinode.prototype.clearRouter = function(){
  this.router.routes = [];
  return this;
};

Returns:

  • {Scrapinode} itself, allow to chain methods call

Scrapinode.prototype.defaults()

Expose default routes for the contents "title", "descriptions", "videos", "images"


// Expose the default routes for the contents "title", "descriptions",
// "videos" and "images" (see lib/defaults/), ready to be passed to useAll().
//
// @returns {Array} routes
Scrapinode.prototype.defaults = function(){
  return defaults;
};

Returns:

  • {Array} routes

lib/scraper.js

module.exports = Scraper

Expose Scraper


module.exports = Scraper;

Scraper(url,window,router)

Scraper class


// Scraper class.
//
// @param {String} url - url of the scraped page
// @param {Object} window - window-like object exposing $ and location.href
// @param {Router} router - router holding the routes and their operations
function Scraper(url,window,router){
	this.url = url;
	this.window = window;
	this.router = router;
}

Parameters:

  • url {String} -
  • window {Object} -
  • router {Router} -

Scraper.prototype.get(content,[index])

Get the given content from this.router


// Get the given content by running the first matching operation found in
// this.router. When an operation returns null ("not found here"), the search
// resumes at the next route, so more generic operations act as fallbacks.
//
// @param {String} content - name of the content (e.g. "title")
// @param {Number} [index] - position in the route stack to start from (defaults to 0)
// @returns {String|Number|Object|Boolean|null|undefined} whatever the matching operation returns
Scraper.prototype.get = function(content,index) {
	if(!index) index = 0;
	var result = this.router.dispatch(this.url,content,index);
	var trouvaille = result.operation(this.window);
	// null means the operation gave up: recurse from the route after the match.
	if(trouvaille === null) return this.get(content,result.index + 1);
	return trouvaille;
};

Parameters:

  • content {String} -
  • [index] {Number} -

Returns:

  • {String|Number|Object|Boolean|null|undefined}

lib/route.js

module.exports = Route

Expose Route


module.exports = Route;

Route(path,content,operation)

Route class


// Route class.
// String paths/contents are compiled to regexes; the wildcard '*' becomes
// the match-anything pattern '.*'.
//
// @param {String|RegExp} path -
// @param {String|RegExp} content -
// @param {Function} operation -
function Route(path,content,operation){
	if(typeof(path) === 'string'){
		path = new RegExp(path === '*' ? '.*' : path);
	}
	if(typeof(content) === 'string'){
		content = new RegExp(content === '*' ? '.*' : content);
	}
	this.path = path;
	this.content = content;
	this.operation = operation;
}

Parameters:

  • path {String|RegExp} -
  • content {String|RegExp} -
  • operation {Function} -

Route.prototype.match(url,content)

Attempt to match the given url with this.path and content with this.content


// Attempt to match the given url against this.path and the given content
// name against this.content.
//
// @param {String} url -
// @param {String} content -
// @returns {Boolean} true when both regexes match
Route.prototype.match = function(url,content){
	return this.path.test(url) && this.content.test(content);
};

Parameters:

  • url {String} -
  • content {String} -

Returns:

  • {Boolean}

lib/browser.js

jsdom = require('jsdom'),

Modules dependencies


var jsdom = require('jsdom'),
    cheerio = require('cheerio'),
    request = new require('superagent').agent(),
    fs = require('fs'),
    ScrapinodeError = require('./error/scrapinode-error'),
    HTTPError = require('httperror');

jqueryExt = fs.readFileSync(__dirname + '/../deps/jquery-regex-selector.js').toString()

jQuery dependencies


var jqueryExt = fs.readFileSync(__dirname + '/../deps/jquery-regex-selector.js').toString();
var jquery = fs.readFileSync(__dirname + '/../deps/jquery-2.0.2.min.js').toString() + jqueryExt;

zepto = fs.readFileSync(__dirname + '/../deps/zepto-v1.0.js').toString()

Zepto library


var zepto = fs.readFileSync(__dirname + '/../deps/zepto-v1.0.js').toString();

exports.load(options,callback)

Build the DOM of the given page found at options.url or options.html


// Build the DOM of the page found at options.url, or of the HTML given in
// options.html (in which case no HTTP request is made).
//
// @param {Object} options -
// @param {Function} callback - called with (err, window)
exports.load = function(options,callback){

    if(options.html){
        // Defer so the callback is invoked asynchronously in both code paths.
        return process.nextTick(function(){
            buildDOM(options.html,options.engine,options.url,callback);
        });
    }

    getRequest(options,function(err,body){
        if(err) return callback(err);
        buildDOM(body,options.engine,options.url,callback);
    });
};

Parameters:

  • options {Object} -
  • callback {Function} -
  • callback().err {Error} -
  • callback().window {Object} -

getRequest(options,callback)

Send an HTTP GET request to options.url


// Send an HTTP GET request to options.url and call back with the body.
// Retries on error up to options.retries times, follows HTML meta-refresh
// redirections, and turns image responses into a minimal HTML page.
//
// @param {Object} options - configuration of the HTTP GET request
//   (url, headers, timeout, redirects, retries)
// @param {Function} callback - called with (err, body)
function getRequest(options,callback){
    // Set once the response stream has been destroyed (image case) so the
    // pending "end" handler does not invoke the callback a second time.
    var destroyed = false;
    var req = request.get(options.url)
        .set(options.headers)
        .timeout(options.timeout)
        .redirects(options.redirects)
        .buffer(false)
        .end(function(err,res){
            if(err) return onError(err);

            // Check HTTP status code. A 3xx here means superagent exhausted
            // its redirect budget; 4xx/5xx are treated as errors.
            var isHTTPError = isRedirect(res.status) || isClientError(res.status) || isServerError(res.status);
            if(isHTTPError) return onError(new HTTPError(res.status));

            // Attach event handlers and build the body
            var body = '';
            res.on('data',function(chunk){
                body += chunk;
            });
            res.on('end',function(){
                if(destroyed) return;
                // Check if a HTTP refresh/redirection is present into the HTML page, if yes refreshes/redirects.
                // NOTE(review): options.url is overwritten here; callers relying
                // on the original url should keep their own copy.
                var matches = body.match(/<meta[ ]*http-equiv="REFRESH"[ ]*content="[0-9]{1,};[ ]*URL=(.*?)"[ ]*\/?>/i);
                if(matches && matches[1]){
                    options.url = matches[1];
                    return getRequest(options,callback);
                }
                callback(null,body);
            });
            res.on('error',onError);

            // Check if content-type is an image, if yes destroy the response and build a HTML page with the image in it
            if(isImage(res.headers)){
                res.destroy();
                destroyed = true;
                body = '<!DOCTYPE html><html><head></head><body><img src="' + options.url + '" /></body></html>';
                return callback(null,body);
            }
        });

    // Error event handler: retry while options.retries is still > 0
    // (post-decrement), then give up and report the error.
    function onError(err){
        if(options.retries--) return getRequest(options,callback);
        callback(err);
    }
}

Parameters:

  • options {Object} - configuration of the HTTP GET request
  • options.headers {String} - set of headers
  • options.timeout {Number} - timeout
  • options.redirects {Number} - number of times the request will follow redirection instructions
  • options.retries {Number} - number of times the request will be resent if the request fails
  • callback {Function} -

buildDOM(body,engine,url,callback)

Build a DOM representation of the given HTML body


// Build a DOM representation of the given HTML body with the chosen engine
// and call back with a window-like object exposing $ and location.href.
//
// @param {String} body - html page
// @param {String} engine - 'jsdom', 'jsdom+zepto' or 'cheerio'
// @param {String} url - url of the page containing the given `body`
// @param {Function} callback - called with (err, window)
function buildDOM(body,engine,url,callback){
    // Guard against empty HTTP responses: nothing to parse.
    if(!body){
        return callback(new ScrapinodeError('The HTTP response contains an empty body: "' + body +'"'));
    }

    if(engine === 'jsdom' || engine === 'jsdom+zepto'){
        // jsdom executes a DOM library inside the page: zepto, or jquery
        // (bundled with the regex-selector extension).
        var library = engine === 'jsdom+zepto' ? zepto : jquery;
        try{
            jsdom.env({
               html: body,
               src : [library],
               done : function(err,window){
                   if(err) return callback(err);
                   if(!window) return callback(new ScrapinodeError('The "window" provides by JSDOM is falsy: ' + window));
                   window.location.href = url;
                   callback(err,window);
                   // Free jsdom resources once the consumer callback has run.
                   window.close();
               }
            });
        }catch(err){
            callback(err);
        }
    }else if(engine === 'cheerio'){
        var $;
        try{
            $ = cheerio.load(body);
        }catch(err){
            // Bug fix: return here; previously execution fell through and the
            // callback was invoked a second time with an undefined "$".
            return callback(err);
        }
        // Minimal window-like object so operations can use window.$ and
        // window.location.href, as with jsdom.
        var window = {
            $ : $,
            location : {
                href : url
            }
        };
        callback(null,window);
    }else{
        callback(new ScrapinodeError('The engine "' + engine + '" is not supported. Scrapinode only supports jsdom and cheerio.'));
    }
}

Parameters:

  • body {String} - html page
  • engine {String} - name of the engine used to generate the DOM
  • url {String} - url of the page containing the given `body`
  • callback {Function} -

isRedirect(code)

Check if the code is a HTTP redirection status.


// Check if the code is a HTTP redirection status (3xx).
//
// @param {Number} code - HTTP status code
// @returns {Boolean} true for any 3xx status
function isRedirect(code) {
  // Fix off-by-one: the previous upper bound (code < 399) excluded 399.
  return (code >= 300 && code < 400);
}

Parameters:

  • code {Number}

Returns:

  • {Boolean}

isClientError(code)

Check if the code is a HTTP client error status.


// Check if the code is a HTTP client error status (4xx).
//
// @param {Number} code - HTTP status code
// @returns {Boolean} true for any 4xx status
function isClientError(code){
    // Fix off-by-one: the previous upper bound (code < 499) excluded 499.
    return (code >= 400 && code < 500);
}

Parameters:

  • code {Number}

Returns:

  • {Boolean}

isServerError(code)

Check if the code is a HTTP server error status.


// Check if the code is a HTTP server error status (5xx).
//
// @param {Number} code - HTTP status code
// @returns {Boolean} true for any 5xx status
function isServerError(code){
    // Fix off-by-one: the previous upper bound (code < 599) excluded 599.
    return (code >= 500 && code < 600);
}

Parameters:

  • code {Number}

Returns:

  • {Boolean}

isImage(headers)

Check if the content of the HTTP body is an image


// Check if the content of the HTTP body is an image, based on the
// content-type response header.
//
// @param {Object} headers - HTTP response headers (may be null/undefined)
// @returns {Boolean}
function isImage(headers){
    // Guard both missing headers and a missing content-type header.
    var contentType = (headers && headers['content-type']) || '';
    return /image\//i.test(contentType);
}

Parameters:

  • headers {Object} -

Returns:

  • {Boolean}

lib/utils/index.js

url = require('url'),

Module dependencies


var url = require('url'),
   domains = require('./domains');

exports.isURL(path)

Check if the given path is an URL


exports.isURL = function isURL(path) {
   var regex = /(https?:)?\/\/([\-\w\.]+)+/i;
   return regex.test(path);
};

Parameters:

  • path {String} -

Returns:

  • {Boolean}

exports.toURL(path,uri)

Convert a relative/absolute path into an URL


exports.toURL = function toURL(path,uri){
   var absolutePath = path;
   if(!exports.isURL(path)){
      var explodeURL = url.parse(uri);
      // 2 cases: absolute path and relative path to the current pathname
      if( path.charAt(0) === '/'){
         absolutePath = explodeURL.protocol + '//' + explodeURL.host + path;
      }else{
         var explodePathname = explodeURL.pathname.split('/');
         var pathname = explodePathname.slice(0,explodePathname.length - 1).join('/');
         absolutePath = explodeURL.protocol + '//' + (explodeURL.host + '/' +pathname + '/' + path).replace('\/\/','/','g');
      }
   }
   return absolutePath;
};

Parameters:

  • path {String} - relative/absolute path to a resource
  • uri {String} - url

Returns:

  • {String}

exports.inline(text)

Inline the text: remove "\n" characters and collapse runs of " " (space) characters


exports.inline = function inline(text){
   var explode = text.split('\n').join('').split(' ');
   var size = explode.length;
   for(var i=0; i < size ; i++){
      if(!explode[i]){
         explode.splice(i,1);
         i--;
         size--;
      }
   }
   return explode.join(' ').trim();
};

Parameters:

  • text {String} -

Returns:

  • {String}

exports.getWebsiteName(uri)

Get the name of the website from an uri


// Get the name of the website from an uri.
// Walks the hostname components right-to-left through the "domains" table;
// the first component not found in the table is taken as the website name
// (e.g. "society6" in "www.society6.com").
//
// @param {String} uri -
// @returns {String} - name of the website, or '' when the uri has no hostname
exports.getWebsiteName = function getWebsiteName(uri){
   var hostname = url.parse(uri).hostname;
   var name = '';
   if(hostname){
      // NOTE(review): "domains" (from ./domains) is presumably a nested map
      // of known public suffixes — verify its shape in that module.
      var subdomains = domains;
      var components = hostname.split('.');
      for(var i = components.length -1 ; i >=  0; i-- ){
         if(subdomains[components[i]]){
            subdomains = subdomains[components[i]];
         }else{
            name = components[i];
            break;
         }
      }
   }
   return name;
};

Parameters:

  • uri {String}

Returns:

  • {String} - name of the website

lib/error/scrapinode-error.js

util = require('util')

Module dependencies


var util = require('util');

module.exports = ScrapinodeError

Expose ScrapinodeError


module.exports = ScrapinodeError;

ScrapinodeError()

Create a new ScrapinodeError


// Create a new ScrapinodeError.
//
// @param {String} message - description of the error
function ScrapinodeError(message){
   Error.call(this);
   // Bug fix: reference the named function instead of arguments.callee,
   // which is deprecated and throws a TypeError in strict mode / ES modules.
   Error.captureStackTrace(this,ScrapinodeError);
   this.name = 'ScrapinodeError';
   this.message = message;
}

util.inherits(ScrapinodeError,Error);

lib/defaults/index.js

utils = require('./../utils/')

Module dependencies


var utils = require('./../utils/');

module.exports = exports = [

Expose default routes each route is composed of a path, a content name and an operation


// Default routes: each route matches any url ('*') and associates a content
// name with the scraping operation defined below in this module.
module.exports = exports = [
   {
      path : '*',
      content : 'descriptions',
      operation : scrapDescription
   },
   {
      path : '*',
      content : 'title',
      operation : scrapTitle
   },
   {
      path : '*',
      content : 'images',
      operation : scrapImage
   },
   {
      path : '*',
      content : 'videos',
      operation : scrapVideo
   }
];

scrapDescription(window)

Retrieve descriptions of the page


// Retrieve descriptions of the page from Open Graph tags, Schema.org
// attributes, meta description tags, and product-description containers.
//
// @param {Object} window - window-like object exposing $
// @returns {Array} list of description strings (possibly empty)
function scrapDescription(window){
   var $ = window.$;
   // Removed unused "url" local: it read window.url, which the window objects
   // built by lib/browser.js never define.
   var descriptions = [];

   // Open Graph protocol by Facebook <meta property="og:description" content="(*)"/>
   $('meta[property="og:description"]').each(function(){
      var content = $(this).attr('content');
      if(content) descriptions.push(content);
   });

   // Schema.org : <* itemprop="description">(*)</*>
   $('[itemprop="description"]').each(function(){
      var text = $(this).text();
      if(text) descriptions.push(text);
   });

   // Meta tag description: <meta name="description" content="(*)" />
   $('meta[name="description"]').each(function(){
      var content = $(this).attr('content');
      // Bug fix: a meta tag without a content attribute used to crash
      // utils.inline(undefined).
      if(!content) return;
      var description = utils.inline(content).trim();
      if(description) descriptions.push(description);
   });

   // Random text in div and p tags. Oriented product informations
   if(descriptions.length === 0){
      $('div,p').each(function(){
         var klass = $(this).attr('class');
         var id = $(this).attr('id');
         if((klass && klass.toLowerCase() === 'productdesc') || (id && id.toLowerCase() === 'productdesc')){
            var description = utils.inline($(this).text()).trim();
            if(description) descriptions.push(description);
         }
      });
   }
   return descriptions;
}

Parameters:

  • window {Object} - object representing the window

Returns:

  • {Array}

isValidExtension(src)

Check if the extension is considered valid


// Check if the extension of the image url is considered valid.
//
// @param {String} src - url of the image
// @returns {Boolean} true if valid, false otherwise
function isValidExtension(src){
   var extension = src.split('.').pop();
   // NOTE(review): ENUM_INVALID_EXTENSIONS is not defined in this excerpt —
   // presumably a map of blacklisted extensions to false; verify in the module.
   // Anything not explicitly mapped to false is accepted.
   var isValid = ENUM_INVALID_EXTENSIONS[extension] === false ? false : true;
   return isValid;
}

Parameters:

  • src {String} - url of the image

Returns:

  • {Boolean} true if valid, false otherwise

scrapImage(window)

Retrieve image urls on the page


// Retrieve image urls on the page, preferring Open Graph and Schema.org
// images, then product-looking images, then any <img> on the page.
//
// @param {Object} window - window-like object exposing $ and location.href
// @returns {Array} list of absolute image urls
function scrapImage(window){
   var $ = window.$;
   // Bug fix: the window objects built by lib/browser.js expose the page url
   // at window.location.href; window.url was always undefined, which broke
   // the resolution of relative image paths below.
   var url = window.location.href;

   var thumbs = [];
   var thumbsRejected = [];
   var title = scrapTitle(window);
   // Push the image src (resolved to an absolute url) into thumbs — at the
   // front when "beginning" is truthy; srcs with a blacklisted extension are
   // kept aside in thumbsRejected as a last resort.
   var addToThumbs = function(image,beginning){
      var src = $(image).attr('src');
      if(src && isValidExtension(src) ){
         src = utils.toURL(src,url);
         if(beginning){
            thumbs.unshift(src);
         }else{
            thumbs.push(src);
         }
      }else if(src){
         thumbsRejected.push(src);
      }
   };
   // Open Graph protocol by Facebook: <meta property="og:image" content="(*)"/>
   $('meta[property="og:image"]').each(function(){
      var content = $(this).attr('content');
      // Bug fix: pass the page url so relative og:image paths can be
      // resolved, consistent with addToThumbs above.
      if(content) thumbs.push(utils.toURL(content,url));
   });

   // Schema.org: <img itemprop="image" src="(*)"/>
   $('img[itemprop="image"]').each(function(){
      addToThumbs(this);
   });

   // Oriented product informations
   if(thumbs.length < 1){
      $('img[id*="product"]').each(function(){
          addToThumbs(this);
      });

      $('img[class*="product"]').each(function(){
          addToThumbs(this);
      });
   }

   // Grab all images
   if(thumbs.length < 10){
      $('img').each(function(){
         if($(this).attr('itemprop') === 'image') return;
         var alt = $(this).attr('alt');
         // Leave this test alone
         // the selector 'img[alt="title"]' will not work if the title is like LG 42PT35342" PLASMA TV. Escaping issues.
         // Image where the title of the page is equal to the content of the alt attribute of the image tag.
         if(alt === title){
            addToThumbs(this,true);
         }else{
            addToThumbs(this);
         }
      });
   }

   // Fall back to the rejected srcs rather than returning nothing at all.
   if(thumbs.length === 0){
      thumbs = thumbsRejected;
   }

   return thumbs;
}

Parameters:

  • window {Object} -

Returns:

  • {Array}

scrapTitle(window)

Retrieve the most appropriate title of the page


// Retrieve the most appropriate title of the page.
//
// @param {Object} window -
// @returns {String} title of the page, inlined ('' when no candidate found)
function scrapTitle(window){
   var $ = window.$;
   var url = window.location.href;

   // Tags or attributes which can contain a nice title for the page,
   // listed in order of decreasing preference.
   var titleTag =  $('title').text().trim();
   var metaTitleTag = $('meta[name="title"]').attr('content');
   var openGraphTitle = $('meta[property="og:title"]').attr('content');
   var h1Tag = $('h1').eq(0).text().trim();
   var itempropNameTag = $('[itemprop="name"]').text().trim();
   var titles = [titleTag, metaTitleTag, openGraphTitle, h1Tag, itempropNameTag];

   // Regex of the web site name
   // NOTE(review): the website name is not regex-escaped; names containing
   // regex metacharacters would alter the match — confirm acceptable.
   var nameWebsite = utils.getWebsiteName(url);
   var regex = new RegExp(nameWebsite,'i');
   // Sort to find the best title: prefer candidates that do NOT contain the
   // website name, then fall back to any non-empty candidate.
   var titlesNotEmpty = titles.filter(function(value){
      return !!value;
   });
   var titlesBest = titlesNotEmpty.filter(function(value){
      return !regex.test(value);
   });
   var bestTitle = (titlesBest && titlesBest[0]) || (titlesNotEmpty && titlesNotEmpty[0]) || '';
   return utils.inline(bestTitle);
}

Parameters:

  • window {Object} -

Returns:

  • {String} title of the page

scrapVideo(window)

Retrieve the video urls on the page


// Retrieve the video urls on the page from Open Graph meta tags and from
// <video>/<embed> elements.
//
// @param {Object} window - window-like object exposing $ and location.href
// @returns {Array} list of absolute video urls
function scrapVideo(window){
   var $ = window.$;
   var url = window.location.href;
   var thumbs = [];

   // Open Graph protocol by Facebook: <meta property="og:video" content="(*)"/>
   $('meta').each(function(){
      var property = $(this).attr('property');
      var content = $(this).attr('content');
      if(property === 'og:video' && content){
         // Bug fix: pass the page url so relative paths can be resolved,
         // consistent with the <video>/<embed> handling below.
         thumbs.push(utils.toURL(content,url));
      }
   });

   // Plain <video> and <embed> sources.
   $('video, embed').each(function(){
      var src = $(this).attr('src');
      if(src) thumbs.push(utils.toURL(src,url));
   });

   return thumbs;
}

Parameters:

  • window {Object} -

Returns:

  • {Array}

Licence

(The MIT License)

Copyright (c) 2013 Rémy Loubradou

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.