How to scrape data with Node.js and Cheerio!

#codenewbie #javascript #tutorial #pairprogramming

This is a tutorial, but you should at least know how to install packages and work with Node.js/Express and MongoDB.

For obvious reasons I can't share the link to the site I'm going to scrape. My recommendation is to find a website where each link to a property ends with the page number. If you have a more complex site, we will consider other options in following tutorials, but for now, let's start with a simpler one. Also, you can be more creative and improve this method.

Have you ever done routine work such as searching through a page and trying to get data from it? It might be links, emails, phone numbers, or anything else. You want to do it faster, because time is money or at least new opportunities, but you are tired of opening a new page again and again.

The first time I thought about scraping was when I took part in a hackathon, and my team and I had to find an answer to the question: "How and where can we get the participants' data, and can we do it fast?". We had an idea to develop interaction among our competitors.

Of course, we did it, but not following best practices. So, when I got back home, I decided to try something similar, just for fun, and now it's time to see the result.

The tools we are going to use are:

Node.js (Express) - yes, don't forget to install node.js
MongoDB(Mongoose)
Cheerio

The first step is to install the dependencies (you can use npm):

1. cheerio
2. dotenv
3. mongoose
4. nodemon
5. objects-to-csv
6. request
7. request-promise

Let's suppose we need to scrape the data about clients:

const mongoose = require("mongoose");

// Shape of one scraped client record as stored in MongoDB.
const clientSchema = new mongoose.Schema({
  name: {
    type: String,
  },
  email: {
    type: String,
  },
  phone: {
    type: String,
  },
  website: {
    type: String,
  },
  location: {
    type: String,
  },
  companyName: {
    type: String,
  },
  // Address of the page the record was scraped from.
  pageUrl: {
    type: String,
  },
  date: {
    type: Date,
    // Pass the function itself so the timestamp is taken when each document
    // is created; `Date.now()` would be evaluated once, when the schema is
    // defined, and every document would share that same value.
    default: Date.now,
  },
});

module.exports = mongoose.model("clients", clientSchema);

I hope we don't need any explanation of how to create a model for MongoDB; the structure of the project is also up to you.

The next step is to create the scraper file, which I call worker-script.js:

// We work with Node.js Workers to launch the threads
const { parentPort} = require("worker_threads");
// Cheerio - we use it to scrape our data
const $ = require("cheerio");
// request-promise - we need it to fetch the HTML that cheerio parses
const rp = require("request-promise");
// Model - we need a Clients model to save all the scraping data
const Clients = require("./models/client");
// Mongoose - helps us work with MongoDB
const mongoose = require("mongoose");
// dotenv - you need it to set up your dev environment
require("dotenv").config();


// DB connection: open the MongoDB connection configured through DB_URL and
// log whether it succeeded.
const dbConnectionOptions = {
  useNewUrlParser: true,
  useUnifiedTopology: true,
  useFindAndModify: false,
};

const onDbConnected = () => console.log("Db is connected!");
const onDbConnectionFailed = (err) => console.log(err);

mongoose
  .connect(process.env.DB_URL, dbConnectionOptions)
  .then(onDbConnected)
  .catch(onDbConnectionFailed);

// Launch threads: every message from the parent carries the id of the next
// page to scrape.
// NOTE(review): when scraping fails nothing is posted back, so a parent that
// only dispatches new ids in response to messages will stop feeding this
// worker - confirm this is intended.
const reportScrapedOwner = (owner) => {
  // Tell the parent that this thread has finished its page.
  parentPort.postMessage({ status: 'OK', payload: owner });
};

const logScrapeFailure = (err) => {
  console.error(err);
};

parentPort.on("message", (nextId) => {
  // Call our ScraperEngine
  ScraperEngine(nextId).then(reportScrapedOwner).catch(logScrapeFailure);
});

// ScraperEngine
/**
 * Downloads one client page, extracts the client record from it and stores
 * the record in MongoDB unless a client with the same name is already stored.
 *
 * @param {number|string} nextId - Page id appended to the base link.
 * @returns {Promise<object|undefined>} The scraped client once it has been
 *   saved; undefined when the page is empty, the client already exists, or
 *   processing the downloaded page failed (that error is logged).
 * @throws Rejects with the request error when the page cannot be downloaded.
 */
async function ScraperEngine(nextId) {
  // Build the page address once - use your link - so the URL we request is
  // exactly the one we record on the client.
  const clientUrl = `yourLink${nextId}`;

  let html;
  try {
    // Call request promise
    html = await rp(clientUrl);
  } catch (err) {
    console.log(err.message);
    throw err;
  }

  try {
    // Call scraper
    const client = scrappingData(html, clientUrl);

    // Validate some items when we can't find useful info
    const { name, companyName } = client;
    if (companyName === '') {
      console.log('Page is empty');
      return undefined;
    }

    const ownerIsExist = await Clients.findOne({ name });
    if (ownerIsExist) {
      console.log("This owner is already exist");
      return undefined;
    }

    // Save data in MongoDB. Wait for the write so the caller only receives
    // the client after it is stored, and a failed save is logged below
    // instead of becoming an unhandled rejection.
    await new Clients(client).save();
    return client;
  } catch (err) {
    console.log(err);
    return undefined;
  }
}

// Scraping function
/**
 * Extracts one client record from the HTML of a client page.
 *
 * @param {string} html - Markup of the downloaded page.
 * @param {string} clientUrl - Address the page was downloaded from.
 * @returns {object} Client fields (name, email, phone, website, location,
 *   companyName, pageUrl) read from the page by CSS class.
 */
function scrappingData(html, clientUrl) {
  // Use cheerio: parse the markup once and reuse the document for every
  // lookup instead of re-parsing the whole page for each field.
  const page = $.load(html);
  const contactRows = page(".margin-top5");

  return {
    // Example of how to get text from our HTML by the class
    name: page(".property-name").text(),
    email: contactRows.eq(2).text(),
    phone: contactRows.eq(1).text(),
    // Example of how to get link from our HTML by the class
    website: page(".company-website-url").children("a").attr("href"),
    // Select the button by its id. Calling .attr("id", "address-btn") with
    // two arguments SETS the id on every ".button" instead of filtering.
    location: page(".button#address-btn").html(),
    // Collapse every run of whitespace (spaces, tabs, newlines) to one space.
    companyName: page(".margin-top10").eq(2).text().replace(/\s+/g, " ").trim(),
    pageUrl: clientUrl,
  };
}

I had to scrape a lot of data, which is why I chose Node.js Workers as an effective way to do it. Let's launch our 'Threads'. For that purpose we use parentPort.on, in which we call our ScraperEngine function, which saves the data.

The last piece of the puzzle is our "index.js" file, which is the one we actually launch (nodemon index.js).

//Create our worker
const { Worker} = require("worker_threads");
// Resolve the path to our worker-script
const workerScriptFilePath = require.resolve("./worker-script.js");

// Scraper start point
// Id of the next page that has not been handed to a worker yet.
let nextPropertyId = 2000000;

// Number of worker 'Threads' scraping in parallel
const THREAD_COUNT = 8;

for (let i = 1; i <= THREAD_COUNT; i++) {
    // Create a new 'Thread'
    const worker = new Worker(workerScriptFilePath, {workerData: i});

    // Every time a worker reports back, hand it the next id. Post-increment
    // here, exactly like the initial dispatch below: pre-incrementing would
    // skip the id that the counter is currently pointing at.
    worker.on("message", () => {
      worker.postMessage(nextPropertyId++);
    });
    worker.on("error", (error) => console.log(error));
    worker.on("exit", (code) => {
      // Surface abnormal terminations instead of ignoring them silently.
      if (code !== 0) {
        console.log(`Worker ${i} stopped with exit code ${code}`);
      }
    });

    // Initial dispatch: give each worker its first id.
    worker.postMessage(nextPropertyId++);
}

Finally, if you want to create an Excel-compatible CSV file:

const Clients = require("./models/client");
const mongoose = require("mongoose");
const ObjectsToCsv = require('objects-to-csv')

require("dotenv").config();
// Open the MongoDB connection configured through DB_URL and log whether it
// succeeded.
const dbConnectionOptions = {
  useNewUrlParser: true,
  useUnifiedTopology: true,
  useFindAndModify: false,
};

const onDbConnected = () => console.log("Db is connected!");
const onDbConnectionFailed = (err) => console.log(err);

mongoose
  .connect(process.env.DB_URL, dbConnectionOptions)
  .then(onDbConnected)
  .catch(onDbConnectionFailed);


/**
 * Groups the stored clients by company name and writes one row per company
 * to '../excelExport/clientsList.csv'.
 *
 * For each company the first stored name, page URL, email, phone and website
 * are kept, and all distinct locations are collected into one set.
 *
 * @returns {Promise<void>} Settles once the export has been attempted;
 *   failures are logged rather than rethrown.
 */
const agrigateClients = () => {
  const groupByCompany = {
    $group: {
      _id: "$companyName",
      name: { "$first": "$name" },
      pageUrl: { "$first": "$pageUrl" },
      email: { "$first": "$email" },
      phone: { "$first": "$phone" },
      website: { "$first": "$website" },
      location: {
        "$addToSet": "$location",
      },
    },
  };

  // Return the chain so callers can wait for the export to finish.
  return Clients.aggregate([groupByCompany])
    .then((data) => {
      const csv = new ObjectsToCsv(data);
      return csv
        .toDisk('../excelExport/clientsList.csv')
        .then(() => undefined)
        .catch((err) => console.log("This is an error message: " + err));
    })
    .catch((err) => {
      console.log(err);
    });
};