Have you ever wanted to fetch remote HTML data with a backend web service, similar to what cURL or Wget do, and then parse it? If you said no, don’t worry, I said the exact same thing until I was challenged to do so. For the challenge, I chose Express as my framework along with a few libraries to assist in fetching and parsing.
I chose the HTTP and HTTPS libraries for fetching the data and htmlparser2 for parsing the HTML. These are, of course, only a few of the many alternatives available for such a task.
Before starting, I’m going to assume you’ve already got Express installed and set up. If you’re unfamiliar with this process, check out my tutorial on how to create a simple web application using Express.
Let’s go ahead and create a new Express project on our desktop:
express ParseProject
The HTTP and HTTPS libraries should already ship with your Node.js installation since they are core modules. If for any reason you get into a jam, you can always force install them by running the following with the project as your current working directory:
npm install http --save
npm install https --save
I wouldn’t recommend installing the above unless you know for sure they aren’t already available.
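If you want to double check whether they’re available before installing anything, a quick sanity check from your terminal will tell you (this one-liner is just an illustrative sketch):
node -e "require('http'); require('https'); console.log('http and https are built in');"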
Next let’s install the htmlparser2 library for traversing through our HTML:
npm install htmlparser2 --save
By using the --save flag, a reference to each of these dependencies is added to our package.json file.
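For example, after running the installs above, the dependencies section of your package.json might look roughly like this (the version numbers below are only placeholders and will differ for you):
"dependencies": {
    "express": "~4.13.1",
    "htmlparser2": "^3.9.0"
}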
Open the project’s routes/index.js file because this is where we’re going to do all of our coding. Make the top of your file look something like the following:
var express = require('express');
var http = require("http");
var https = require("https");
var htmlparser = require("htmlparser2");
var router = express.Router();
We can now choose to either do all of our coding in the main / route or create another route. I’m going to create a new route called /fetch, which will be responsible for fetching the HTML data:
router.get("/fetch", function(req, res, next) {
});
To make our code a bit more dynamic, we’re going to allow a URL to be passed in as a GET parameter instead of working with a static website:
if(req.query.url === undefined) {
return res.send({message: "url cannot be undefined"});
}
var urlPrefix = req.query.url.match(/.*?:\/\//g);
req.query.url = req.query.url.replace(/.*?:\/\//g, "");
var options = {
hostname: req.query.url
};
if(urlPrefix !== undefined && urlPrefix !== null && urlPrefix[0] === "https://") {
options.port = 443;
https.get(options, function(result) {
processResponse(result);
}).on('error', function(e) {
res.send({message: e.message});
});
} else {
options.port = 80;
http.get(options, function(result) {
processResponse(result);
}).on('error', function(e) {
res.send({message: e.message});
});
}
So what is happening in the above code? We’re checking to see if a url
was passed with the query parameters. If it wasn’t, we return an error. If it was, we use a regular expression to extract the prefix of the URL and then strip it from the URL. We do this because if the URL uses SSL, we need to make the request with HTTPS rather than HTTP.
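If the regular expression looks a bit cryptic, here is roughly how it behaves against a sample URL (the value below is just for illustration):
var url = "https://www.example.com";
var urlPrefix = url.match(/.*?:\/\//g); // ["https://"]
url = url.replace(/.*?:\/\//g, ""); // "www.example.com"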
The rest leads us to our next function:
var processResponse = function(result) {
var data = "";
result.on("data", function(chunk) {
data += chunk;
});
var tags = [];
var tagsCount = {};
var tagsWithCount = [];
result.on("end", function(chunk) {
var parser = new htmlparser.Parser({
onopentag: function(name, attribs) {
if(tags.indexOf(name) === -1) {
tags.push(name);
tagsCount[name] = 1;
} else {
tagsCount[name]++;
}
},
onend: function() {
for(var i = 0; i < tags.length; i++) {
tagsWithCount.push({name: tags[i], count: tagsCount[tags[i]]});
}
}
}, {decodeEntities: true});
parser.write(data);
parser.end();
res.send({website: req.query.url, port: options.port, data: data, tags: tagsWithCount});
});
}
In the above function we are collecting all of the remote HTML data and then using htmlparser2 to figure out which tag names exist and how many times each one appears.
We then report all of this information back in the response.
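To give you an idea of what comes back, the JSON response might look something along these lines (the values here are made up purely for illustration):
{
    "website": "www.example.com",
    "port": 443,
    "data": "<!DOCTYPE html><html> ... </html>",
    "tags": [
        {"name": "html", "count": 1},
        {"name": "head", "count": 1},
        {"name": "div", "count": 14}
    ]
}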
Here is the code for the full /fetch route from routes/index.js in case you’d like to see it all put together:
router.get("/fetch", function(req, res, next) {
if(req.query) {
if(req.query.url === undefined) {
return res.send({message: "url cannot be undefined"});
}
var urlPrefix = req.query.url.match(/.*?:\/\//g);
req.query.url = req.query.url.replace(/.*?:\/\//g, "");
var options = {
hostname: req.query.url
};
if(urlPrefix !== undefined && urlPrefix !== null && urlPrefix[0] === "https://") {
options.port = 443;
https.get(options, function(result) {
processResponse(result);
}).on('error', function(e) {
res.send({message: e.message});
});
} else {
options.port = 80;
http.get(options, function(result) {
processResponse(result);
}).on('error', function(e) {
res.send({message: e.message});
});
}
var processResponse = function(result) {
var data = "";
result.on("data", function(chunk) {
data += chunk;
});
var tags = [];
var tagsCount = {};
var tagsWithCount = [];
result.on("end", function(chunk) {
var parser = new htmlparser.Parser({
onopentag: function(name, attribs) {
if(tags.indexOf(name) === -1) {
tags.push(name);
tagsCount[name] = 1;
} else {
tagsCount[name]++;
}
},
onend: function() {
for(var i = 0; i < tags.length; i++) {
tagsWithCount.push({name: tags[i], count: tagsCount[tags[i]]});
}
}
}, {decodeEntities: true});
parser.write(data);
parser.end();
res.send({website: req.query.url, port: options.port, data: data, tags: tagsWithCount});
});
}
}
});
How about testing this thing out? With the project as your current directory in the command prompt or terminal, run the following:
npm start
This will start your project on http://localhost:3000
for viewing. From your web browser, hit http://localhost:3000/fetch?url=https://www.slack.com
and see what happens. With a little luck you should get all the HTML data and information about the tags.
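If you’d rather test from the command line, cURL works just as well, for example:
curl "http://localhost:3000/fetch?url=https://www.slack.com"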
If for some reason you want to fetch and parse HTML data with Express for Node.js, you can make use of the HTTP, HTTPS, and htmlparser2 dependencies, although there are plenty of alternative libraries that can accomplish the same thing. From there you can create a custom route and return the data as JSON for an API-like response.