/**
 * @file Scrapes https://trendogate.com and creates a .csv of historical trending data
 * @author Rekai Musuka
 * @license MIT
 * @version 2020.03.23
 */
import { addDays, format } from 'date-fns';
import cheerio from 'cheerio';
import axios from 'axios';
import fs from 'fs';

const BASE_URL = "https://trendogate.com/"; // note the trailing slash
const CSV_PATH = "./data.csv";
const USA_ID = 23424977; // place identifier used in the URL path
const WAIT = 500; // pause between requests, in ms

/** Only the first this-many trends of a day are considered for the CSV. */
const MAX_TRENDS_PER_DAY = 20;

/** Trend names must consist solely of Latin letters (incl. Latin-1 accents), whitespace and '#'. */
const ALLOWED_NAME = /^([A-Za-z\u00C0-\u00D6\u00D8-\u00f6\u00f8-\u00ff\s\#]*)$/i;

// URL shape, e.g. https://trendogate.com/placebydate/23424977/2015-04-01
// CSV columns: Topic | Position | Date

// One date per week, from 1 March 2015 (months are 0-indexed, so 2 is March)
// up to and including the moment the script starts.
const goal: Date = new Date();
const days: Array<Date> = [];
for (let day = new Date(2015, 2, 1); goal >= day; day = addDays(day, 7)) {
  days.push(day);
}

/**
 * Scrapes one page per entry in `days` and appends the accepted trends to the CSV.
 *
 * Requests are issued one at a time with a pause of `WAIT` ms after each, so rows
 * are written in date order. A failed request is reported on stderr and skipped;
 * it does not abort the remaining dates. The CSV write stream is always closed
 * before the returned promise settles.
 *
 * @returns a Promise with no success value
 */
async function scrapeData(): Promise<void> {
  const csv = new CSVManager(CSV_PATH);
  csv.createWriteStream(); // stays open until every date has been processed

  try {
    for (const day of days) {
      const url = getNewDateUrl(day);

      try {
        const res = await axios.get<string>(url);

        if (res.status === 200) {
          const trends = handleHttpResponse(res.data, day);
          process.stdout.write(`URL: ${url}\n`);
          writeToCsv(trends, csv);
        } else {
          process.stderr.write(`Server returned with a Status of: ${res.status}\n`);
        }
      } catch (err) {
        // With its default configuration axios rejects for non-2xx responses and
        // for network failures; report and move on to the next date.
        const reason = err instanceof Error ? err.message : String(err);
        process.stderr.write(`Request to ${url} failed: ${reason}\n`);
      }

      await sleep(WAIT); // avoid spamming the server
    }
  } finally {
    csv.closeWriteStream();
  }
}

/**
 * Transforms a trendogate HTML page into the list of trends it contains.
 *
 * Each child of `div.panel > ul.list-group` is expected to hold the trend text
 * at `firstChild.firstChild.data`. Children without that shape are reported on
 * stderr and skipped; the ranking of the remaining trends is still their
 * 1-based position in the list.
 *
 * @param html String of HTML data
 * @param day The day the page's data is for
 * @returns An Array of trends
 */
function handleHttpResponse(html: string, day: Date): Array<Trend> {
  const trends: Array<Trend> = [];
  const $ = cheerio.load(html);

  $('div.panel > ul.list-group').children().each((i, child) => {
    const term = extractTrendText(child);
    if (term === null) {
      process.stderr.write(`Skipping list item ${i + 1}: unexpected markup\n`);
      return;
    }
    trends.push(new Trend(i + 1, term, day));
  });

  return trends;
}

/**
 * Reads `node.firstChild.firstChild.data` without assuming any link exists.
 *
 * @param node A parsed DOM node (typed loosely so this does not depend on a
 *             particular version of the cheerio typings)
 * @returns The text data, or null if the node does not have the expected shape
 */
function extractTrendText(node: unknown): string | null {
  const inner = readProperty(readProperty(node, 'firstChild'), 'firstChild');
  const data = readProperty(inner, 'data');
  return typeof data === 'string' ? data : null;
}

/** Returns `value[key]` when `value` is a non-null object, otherwise undefined. */
function readProperty(value: unknown, key: string): unknown {
  if (typeof value !== 'object' || value === null) return undefined;
  return (value as Record<string, unknown>)[key];
}

// SO: https://stackoverflow.com/questions/14249506/how-can-i-wait-in-node-js-javascript-l-need-to-pause-for-a-period-of-time
/**
 * Resolves after the given delay.
 * @param ms Delay in milliseconds
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>(resolve => {
    setTimeout(resolve, ms);
  });
}

/**
 * Creates a trendogate URL for the given date.
 * @param date The day of the trends you want to query
 * @returns e.g. https://trendogate.com/placebydate/23424977/2015-04-01
 */
function getNewDateUrl(date: Date): string {
  // BASE_URL already ends with '/', so no separator is added here.
  return `${BASE_URL}placebydate/${USA_ID}/${format(date, "yyyy-MM-dd")}`;
}

/**
 * Writes a limited number of a day's trends to a CSV file.
 *
 * Constraints:
 *  * only the first `MAX_TRENDS_PER_DAY` entries of `trends` are considered
 *  * a trend is written only if its name matches `ALLOWED_NAME`
 *
 * @param trends List of Trends
 * @param csv CSV which will be written to
 */
function writeToCsv(trends: Array<Trend>, csv: CSVManager): void {
  for (const trend of trends.slice(0, MAX_TRENDS_PER_DAY)) {
    if (ALLOWED_NAME.test(trend.getName())) {
      csv.write(trend.toCsv());
    }
  }
}

/**
 * Manages a single CSV File
 */
class CSVManager {
  private stream: fs.WriteStream | null = null;
  private readonly path: string;

  constructor(path: string) {
    this.path = path;
  }

  /**
   * Writes plain text to the file through the open write stream.
   * If no stream is open, a message is written to stderr and the row is dropped.
   * @param row The line of text which will be written
   */
  public write(row: string): void {
    if (this.stream === null) {
      process.stderr.write(`Unable to write to "${this.path}". Stream does not exist.\n`);
      return;
    }

    this.stream.write(row, err => {
      // Thrown from the stream callback, so it is not catchable by the caller of write().
      if (err) throw err;
    });
  }

  /**
   * Creates an appending write stream which must be closed later.
   *
   * If the file does not exist yet it is "touched" first. Calling this while a
   * stream is already open is a no-op, so the existing stream is not leaked.
   */
  public createWriteStream(): void {
    if (this.stream !== null) return;

    if (!fs.existsSync(this.path)) fs.closeSync(fs.openSync(this.path, "w"));
    this.stream = fs.createWriteStream(this.path, { flags: "a" });
  }

  /**
   * Closes the open write stream, if there is one.
   */
  public closeWriteStream(): void {
    if (this.stream === null) return;

    this.stream.end();
    this.stream = null;
  }

  /**
   * Gets the path of the managed file
   */
  public getPath(): string {
    return this.path;
  }
}

/**
 * Represents a trend pulled from trendogate
 */
class Trend {
  private readonly name: string;
  private readonly date: Date;
  private readonly ranking: number; // 1-based position of the trend on its date

  constructor(ranking: number, name: string, date: Date) {
    this.name = name.trim(); // cut surrounding whitespace
    this.date = date;
    this.ranking = ranking;
  }

  /**
   * Gets the title of the trend
   */
  public getName(): string {
    return this.name;
  }

  /**
   * Gets the date the trend is from
   */
  public getDate(): Date {
    return this.date;
  }

  /**
   * Gets the trend's ranking on its day
   */
  public getRanking(): number {
    return this.ranking;
  }

  /**
   * Returns a newline-terminated "name, ranking, yyyy-MM-dd" row that can be
   * appended to a CSV file.
   */
  public toCsv(): string {
    return `${this.name}, ${this.ranking}, ${format(this.date, "yyyy-MM-dd")}\n`;
  }
}

// Classes are not hoisted, so the entry point must run after their declarations.
scrapeData().catch((err: unknown) => {
  const reason = err instanceof Error ? err.message : String(err);
  process.stderr.write(`Scrape aborted: ${reason}\n`);
  process.exitCode = 1;
});