/**
 * @file Scrapes https://trendogate.com and creates a .csv of historical trending data
 * @author Rekai Musuka
 * @license MIT
 * @version 2020.03.17
 */

import { addDays, format } from 'date-fns';
import cheerio from 'cheerio';
import axios from 'axios';
import fs from 'fs';

const BASE_URL = "https://trendogate.com/";
const CSV_PATH = "./data.csv";
const USA_ID = 23424977;
const NUMBER_OF_DAYS = 200; // NOTE(review): currently not referenced in this file.
const WAIT = 500; // in ms

/** Only the first this-many trends of a day are considered for the CSV. */
const MAX_TRENDS_PER_DAY = 20;

/** Names made only of Latin letters (incl. Latin-1 accented letters), whitespace and '#'. */
const WRITABLE_NAME = /^([A-Za-z\u00C0-\u00D6\u00D8-\u00f6\u00f8-\u00ff\s\#]*)$/i;

// Example page:  https://trendogate.com/placebydate/23424977/2015-04-01
// CSV columns:   Topic | Position | Date

// Oldest date to scrape. The month argument of `Date` is 0-indexed, so 2 means March.
const goal = new Date(2015, 2, 1);

// Dates to scrape: today, then one-week steps back in time until `goal` is passed.
const days: Date[] = [];
for (let day = new Date(); day.getTime() >= goal.getTime(); day = addDays(day, -7)) {
  days.push(day);
}

/**
 * Downloads the trend page for every entry in `days`, one request at a time,
 * and appends the extracted trends to the CSV file at `CSV_PATH`.
 *
 * Requests are awaited before the next one starts so rows are written in the
 * same order as `days`, and the CSV stream is closed once the loop finishes
 * (or fails). A failed request is reported on stderr and the loop continues.
 */
async function scrapeData(): Promise<void> {
  const csv = new CSVManager(CSV_PATH);
  csv.createWriteStream();

  try {
    for (const day of days) {
      const url = getNewDateUrl(day);

      try {
        // axios rejects non-2xx responses by default; accept every status so
        // the status check below is what decides how the response is handled.
        const res = await axios.get<string>(url, { validateStatus: () => true });

        if (res.status === 200) {
          const trends = handleHttpResponse(res.data, day);
          process.stdout.write(`URL: ${url}\n`);
          writeToCsv(trends, csv);
        } else {
          process.stderr.write(`Server returned with a Status of: ${res.status}\n`);
        }
      } catch (err) {
        // Network-level failure (DNS, timeout, reset, ...): report and move on.
        const reason = err instanceof Error ? err.message : String(err);
        process.stderr.write(`Request to ${url} failed: ${reason}\n`);
      }

      // Pause between requests.
      await sleep(WAIT);
    }
  } finally {
    csv.closeWriteStream();
  }
}

/**
 * Extracts the trends listed in a trend page.
 *
 * Every child of `div.panel > ul.list-group` is treated as one trend; its
 * name is the data of the first node inside the child's first node, and its
 * ranking is the child's 1-based position in the list. Children that do not
 * have that shape are skipped instead of raising a TypeError.
 *
 * @param html - Markup of the page.
 * @param day - Date stored on every extracted trend.
 * @returns The extracted trends, in list order.
 */
function handleHttpResponse(html: string, day: Date): Trend[] {
  const trends: Trend[] = [];
  const $ = cheerio.load(html);

  $('div.panel > ul.list-group').children().each((i, child) => {
    const wrapper = child.firstChild;
    const textNode = wrapper ? wrapper.firstChild : null;
    const term = textNode ? textNode.data : undefined;

    if (typeof term === "string") {
      trends.push(new Trend(i + 1, term, day));
    }
  });

  return trends;
}

// SO: https://stackoverflow.com/questions/14249506/how-can-i-wait-in-node-js-javascript-l-need-to-pause-for-a-period-of-time
/**
 * Resolves after `ms` milliseconds.
 *
 * @param ms - Delay in milliseconds.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>(resolve => {
    setTimeout(resolve, ms);
  });
}

/**
 * Builds the trend page URL for a date, e.g.
 * `https://trendogate.com/placebydate/23424977/2015-04-01`.
 *
 * @param date - Day to build the URL for.
 * @returns The page URL for `USA_ID` on that day.
 */
function getNewDateUrl(date: Date): string {
  // BASE_URL ends with '/', so strip it to avoid "//placebydate" in the result.
  const base = BASE_URL.replace(/\/+$/, "");
  return `${base}/placebydate/${USA_ID}/${format(date, "yyyy-MM-dd")}`;
}

/**
 * Writes the rows of one day to the CSV.
 *
 * Only the first `MAX_TRENDS_PER_DAY` entries of `trends` are considered, and
 * of those only the ones whose name matches `WRITABLE_NAME` are written.
 *
 * @param trends - Trends of a single day, in ranking order.
 * @param csv - Manager whose stream receives the rows.
 */
function writeToCsv(trends: Trend[], csv: CSVManager): void {
  for (const trend of trends.slice(0, MAX_TRENDS_PER_DAY)) {
    if (WRITABLE_NAME.test(trend.getName())) {
      csv.write(trend.toCsv());
    }
  }
}

/**
 * Quotes a CSV field when it contains a comma, a double quote or a line
 * break (embedded quotes are doubled); any other value is returned unchanged.
 */
function escapeCsvField(value: string): string {
  return /[",\r\n]/.test(value) ? `"${value.replace(/"/g, '""')}"` : value;
}

/** Thin wrapper around an append-mode write stream for one file. */
class CSVManager {
  private stream: fs.WriteStream | null = null;
  private readonly path: string;

  /**
   * @param path - Path of the file rows are appended to.
   */
  constructor(path: string) {
    this.path = path;
  }

  /**
   * Appends `row` to the file. When no stream is open a message is written
   * to stderr and the row is dropped.
   *
   * @param row - Text to append; the caller supplies the line terminator.
   * @throws Rethrows, from the stream's write callback, any error the write reports.
   */
  public write(row: string): void {
    if (this.stream === null) {
      process.stderr.write(`Unable to write to "${this.path}". Stream does not exist.`);
      return;
    }

    this.stream.write(row, err => {
      if (err) throw err;
    });
  }

  /**
   * Opens the append stream, creating an empty file first when none exists.
   * Does nothing when a stream is already open, so an open stream is never
   * replaced without being closed.
   */
  public createWriteStream(): void {
    if (this.stream !== null) return;

    if (!fs.existsSync(this.path)) {
      fs.closeSync(fs.openSync(this.path, "w"));
    }
    this.stream = fs.createWriteStream(this.path, { flags: "a" });
  }

  /** Ends the stream and forgets it. Does nothing when no stream is open. */
  public closeWriteStream(): void {
    if (this.stream === null) return;

    this.stream.end();
    this.stream = null;
  }

  /** @returns The path given to the constructor. */
  public getPath(): string {
    return this.path;
  }
}

/** One trending topic: its name, its ranking and the day it was listed. */
class Trend {
  private readonly name: string;
  private readonly date: Date;
  private readonly ranking: number;

  /**
   * @param ranking - Position of the topic in the day's list.
   * @param name - Topic text.
   * @param date - Day the topic was listed.
   */
  constructor(ranking: number, name: string, date: Date) {
    this.name = name;
    this.date = date;
    this.ranking = ranking;
  }

  public getName(): string {
    return this.name;
  }

  public getDate(): Date {
    return this.date;
  }

  public getRanking(): number {
    return this.ranking;
  }

  /**
   * @returns The row `name, ranking, yyyy-MM-dd` followed by a newline. The
   * name is quoted only when it contains a comma, a quote or a line break.
   */
  public toCsv(): string {
    return `${escapeCsvField(this.name)}, ${this.ranking}, ${format(this.date, "yyyy-MM-dd")}\n`;
  }
}

scrapeData().catch(err => {
  const reason = err instanceof Error ? err.message : String(err);
  process.stderr.write(`Scrape failed: ${reason}\n`);
  process.exitCode = 1;
});