How to Scrape a Steemit Post with JavaScript (Node.js)

in #steemit · 7 years ago (edited)


This simple module parses a steemd post page for some essential data, including images, the body, the author, the URL of the original content, and more. It's also very easy to extend. It uses cheerio, a simple jQuery-like way to load HTML text into a manipulable object. It also uses a package for getting data URIs if the imageUris option is passed. Chrome's inspector tool was helpful here: if you open it and right-click an element, you can copy its CSS selector and use it with cheerio. You'll notice I do some processing on the post body; this is because my posts are returned as Markdown and I just want the relevant text. Here's the script; let me know if you have any questions.

const util = require('util');
const path = require('path');
const request = require('request-promise');
const cheerio = require('cheerio');
const removeMd = require('remove-markdown');
const _ = require('lodash');

// Matches an http(s) URL ending in a common image extension (png, jpg, svg,
// gif, bmp), case-insensitively. The greedy `.*` extends the match to the last
// image extension on the same line. Not referenced by the code shown here.
const reImgHtml = /(https?:\/\/.*\.(?:png|jpg|svg|gif|bmp))/i;
// Matches either the `&nbsp;` entity or anything enclosed in angle brackets
// (i.e. an HTML tag); global and case-insensitive.
const reHtml = /(&nbsp;|<([^>]+)>)/ig;
// Matches http, https and ftp URLs, up to the next whitespace or angle bracket.
const reUrl = /(?:https?|ftp):\/\/[^\s<>]+/g;
// Maps a username to a display name (presumably the account's real name).
// Not referenced by the code shown here.
const realnames = {
  donmesswithbeer: 'My Secret Real Name',
};

/**
 * Deletes every URL from a piece of text.
 *
 * @param {string} str - Text that may contain links.
 * @returns {string} The text with each `reUrl` match replaced by an empty string.
 */
function removeUrls(str) {
  const withoutLinks = str.replace(reUrl, '');
  return withoutLinks;
}

/**
 * Strips HTML markup from a piece of text.
 *
 * @param {string} str - Text that may contain tags or `&nbsp;` entities.
 * @returns {string} The text with every `reHtml` match removed.
 */
function removeHtml(str) {
  // Each match is dropped outright; nothing is inserted in its place.
  return str.replace(reHtml, () => '');
}

/**
 * Reduces a raw post body to plain text.
 *
 * The cleanup runs in three stages: HTML is removed first, then Markdown
 * syntax (via `removeMd`), and finally any URLs.
 *
 * @param {string} body - Raw post body.
 * @returns {string} The body after all three cleanup stages.
 */
function parseText(body) {
  const withoutHtml = removeHtml(body);
  const withoutMarkdown = removeMd(withoutHtml);
  return removeUrls(withoutMarkdown);
}

/**
 * Splits a raw post body into sentence-like chunks.
 *
 * The body is cleaned with `parseText` and then cut at every period that is
 * followed by a space, so the final chunk keeps whatever punctuation ends it.
 *
 * @param {string} body - Raw post body.
 * @returns {string[]} The chunks, in their original order.
 */
function getSentences(body) {
  const plainText = parseText(body);
  return plainText.split('. ');
}

/**
 * Builds a short summary from the first two sentences of a post.
 *
 * @param {string[]} sentences - Sentence chunks as produced by `getSentences`.
 * @returns {string} The first two chunks joined with '. ' and ending in
 *   sentence punctuation, or an empty string when there is no text.
 */
function buildSummary(sentences) {
  const lead = sentences.slice(0, 2).join('. ').trim();
  if (!lead) {
    return '';
  }
  // Splitting on '. ' leaves the final chunk with its own terminator, so only
  // append a period when one is actually missing (avoids "text..").
  return /[.!?]$/.test(lead) ? lead : `${lead}.`;
}

/**
 * Downloads a steemd post page and extracts its essential data.
 *
 * @param {string} postUrl - URL of the post page; it must be a steemd page,
 *   not a steemit one, because the selectors target steemd's markup.
 * @returns {Promise<Object>} Resolves to an object with:
 *   - `meta`: the JSON parsed from the first cell of the page's first table row
 *   - `body`: raw text of the post body element
 *   - `author`: text of the author link in the post head bar
 *   - `url`: `href` of the link in the post head bar
 *   - `title`: text of the first `h3` heading
 *   - `subtitle`: the first sentence of the cleaned body
 *   - `summary`: the first two sentences of the cleaned body
 *   - `parsedBody`: the body after `parseText` cleanup
 * @throws {Error} When the page has no metadata cell (e.g. a non-steemd URL).
 * @throws {SyntaxError} When the metadata cell does not contain valid JSON.
 */
async function getPost(postUrl) {
  const html = await request(postUrl);
  const $ = cheerio.load(html);

  // Fail with a descriptive message instead of an opaque
  // "Cannot read property 'children' of undefined" when the page layout differs.
  const metaCell = $('table > tbody > tr:nth-child(1) > td')[0];
  const metaNode = metaCell && metaCell.children && metaCell.children[0];
  if (!metaNode || typeof metaNode.data !== 'string') {
    throw new Error(`No post metadata found at ${postUrl}; the URL must point to a steemd post page`);
  }
  const meta = JSON.parse(metaNode.data);

  const body = $('div.md > div > pre').text();
  const sentences = getSentences(body);
  const author = $('div.post-head-bar > div > a').text();
  const url = $('div.post-head-bar > a').attr('href');
  const title = $('div h3:first-of-type').text();
  const subtitle = sentences[0];
  const summary = buildSummary(sentences);
  const parsedBody = parseText(body);
  return { meta, body, author, url, title, subtitle, summary, parsedBody };
}

module.exports = {
  getPost,
};

// example
// you MUST use steemd, not steemit
getPost('https://steemd.com/react/@donmesswithabeer/beautiful-codrop-creative-style-buttons-in-react').catch(console.error).then(console.info) // outputs steemit blog data

Sincerely,
Daniel - CEO http://theflite.io

“The place to improve the world is first in one's own heart and head and hands, and then work outward from there.” *
― Robert M. Pirsig

Sort:  

I find it very good since this kind of progress improves our productivity in Steemit. I always like this kind of subject, excellent.

Thanks, I was frustrated by how complicated it can be to use the API to simply get the contents of a post. So hopefully this makes things easier for everyone.