This repository has been archived on 2021-06-30. You can view files and clone it, but cannot push or open issues or pull requests.
paoda 26dc65132b Updated Constraints of the Scraper
No longer just the past 200 or so days. Now the scraper gets all
trending data from 2015-03-01 to <current_day> once a week.
2020-03-17 04:38:30 -03:00

146 lines
3.7 KiB
TypeScript

/**
* @file Scrapes https://trendogate.com and creates a .csv of historical trending data
* @author Rekai Musuka <rk306597@dal.ca>
* @license MIT
* @version 2020.03.17
*/
import { addDays, format } from 'date-fns';
import cheerio from 'cheerio';
import axios from 'axios';
import fs from 'fs';
/** Root address of the site being scraped. */
const BASE_URL = "https://trendogate.com/";
/** File that scraped rows are appended to. */
const CSV_PATH = "./data.csv";
/** Place identifier used in the request path (presumably the site's id for the United States). */
const USA_ID = 23424977;
// NOTE(review): not referenced by any code visible in this file; possibly a leftover.
const NUMBER_OF_DAYS = 200;
/** Pause between two consecutive requests, in milliseconds. */
const WAIT = 500;

// Example request: https://trendogate.com/placebydate/23424977/2015-04-01
// CSV columns:     Topic | Position | Date

/** Oldest date to sample: 1 March 2015 (Date months are zero-based, so 2 means March). */
const goal: Date = new Date(2015, 2, 1);

/** Dates to scrape, newest first: now, then every 7 days back while still on or after `goal`. */
const days: Date[] = [];
for (let cursor = new Date(); cursor >= goal; cursor = addDays(cursor, -7)) {
  days.push(cursor);
}
/**
 * Downloads the trending page for every date in `days` and appends the parsed
 * trends to the CSV file at `CSV_PATH`.
 *
 * Requests are awaited one at a time, with a pause of `WAIT` milliseconds
 * after each, so rows reach the file in the same order as `days`.
 * A failed request or an unparsable page is reported on stderr and skipped;
 * it no longer surfaces as an unhandled promise rejection. The CSV stream is
 * always closed once the loop is done.
 *
 * @returns A promise that settles after every date has been processed.
 */
async function scrapeData(): Promise<void> {
  const csv = new CSVManager(CSV_PATH);
  csv.createWriteStream();

  try {
    for (const day of days) {
      const url = getNewDateUrl(day);

      try {
        // axios rejects on network errors (and, with default settings, on
        // non-2xx statuses), so the request has to live inside try/catch.
        const res = await axios.get(url);

        if (res.status === 200) {
          const trends: Array<Trend> = handleHttpResponse(res.data, day);
          process.stdout.write(`URL: ${url}\n`);
          writeToCsv(trends, csv);
        } else {
          process.stderr.write(`Server returned with a Status of: ${res.status}\n`);
        }
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        process.stderr.write(`Request to ${url} failed: ${reason}\n`);
      }

      // Throttle so the remote server is not hammered.
      await sleep(WAIT);
    }
  } finally {
    // Flush pending rows and release the file handle even if the loop aborted.
    csv.closeWriteStream();
  }
}
/**
 * Extracts the trending terms from one trendogate page.
 *
 * Every child of `div.panel > ul.list-group` is treated as one trend. The term
 * is read from the text data of the first node inside the item's first child
 * (presumably an `<li><a>term</a></li>` layout; verify against the live markup).
 * Items that do not have that shape are skipped instead of throwing.
 *
 * @param html Raw HTML of the page.
 * @param day  Date the page belongs to; stored on every returned trend.
 * @returns Trends in page order, ranked by their 1-based position in the list.
 */
function handleHttpResponse(html: string, day: Date): Array<Trend> {
  const trends: Array<Trend> = [];
  const $ = cheerio.load(html);

  $('div.panel > ul.list-group').children().each((i, child) => {
    const link = child.firstChild;
    const label = link ? link.firstChild : null;
    const term = label ? label.data : undefined;

    // Unexpected markup: skip the item, but keep the ranking tied to its list position.
    if (typeof term !== "string") return;

    trends.push(new Trend(i + 1, term, day));
  });

  return trends;
}
/**
 * Promise-based delay, to be used with `await`.
 * Technique taken from:
 * https://stackoverflow.com/questions/14249506/how-can-i-wait-in-node-js-javascript-l-need-to-pause-for-a-period-of-time
 *
 * @param ms Delay in milliseconds.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number) {
  return new Promise(resolve => setTimeout(resolve, ms));
}
/**
 * Builds the URL of the trending page for a given date,
 * e.g. https://trendogate.com/placebydate/23424977/2015-04-01
 *
 * Trailing slashes on `BASE_URL` are dropped before joining, so the result
 * never contains a doubled slash in front of `placebydate`.
 *
 * @param date Day whose trends should be requested.
 * @returns Absolute URL with the date formatted as `yyyy-MM-dd`.
 */
function getNewDateUrl(date: Date): string {
  const root = BASE_URL.replace(/\/+$/, "");
  return `${root}/placebydate/${USA_ID}/${format(date, "yyyy-MM-dd")}`;
}
/**
 * Appends the trends of one day to the CSV.
 *
 * Only the first 20 entries are considered, and of those only the ones whose
 * name consists solely of Latin letters (including the \u00C0-\u00FF accented
 * range), whitespace and `#` are written.
 *
 * @param trends Trends of a single day, in ranking order.
 * @param csv    Destination the rows are written to.
 */
function writeToCsv(trends: Array<Trend>, csv: CSVManager) {
  // Latin letters plus some other codepoint ranges, whitespace and '#'.
  const allowedName = /^([A-Za-z\u00C0-\u00D6\u00D8-\u00f6\u00f8-\u00ff\s\#]*)$/i;

  // The cut-off is applied before filtering, so at most 20 rows per day are written.
  for (const entry of trends.slice(0, 20)) {
    const accepted = entry.getName().match(allowedName) !== null;
    if (accepted) {
      csv.write(entry.toCsv());
    }
  }
}
/**
 * Small wrapper around an append-mode `fs.WriteStream` for a single CSV file.
 *
 * Lifecycle: `createWriteStream()` -> any number of `write()` calls ->
 * `closeWriteStream()`. Closing is safe at any time, including before a
 * stream was opened or after it was already closed.
 */
class CSVManager {
  /** Open stream, or `null` while no stream is open. */
  private stream: fs.WriteStream | null = null;
  private path: string;

  /**
   * @param path Location of the CSV file.
   */
  constructor(path: string) {
    this.path = path;
  }

  /**
   * Appends one already-formatted row to the file.
   * When no stream is open the row is dropped and a message is printed to stderr.
   *
   * @param row Text to append; the caller supplies the line terminator.
   * @throws Re-throws, from the stream's write callback, any error the stream reports.
   */
  public write(row: string) {
    if (this.stream !== null) {
      this.stream.write(row, err => { if (err) throw err; });
    } else process.stderr.write(`Unable to write to "${this.path}". Stream does not exist.\n`);
  }

  /**
   * Opens the file for appending, creating it first when it does not exist.
   * A stream that is still open from an earlier call is closed first so its
   * file handle is not leaked.
   */
  public createWriteStream() {
    if (this.stream !== null) this.closeWriteStream();
    // Create the file synchronously so it exists as soon as this method returns.
    if (!fs.existsSync(this.path)) fs.closeSync(fs.openSync(this.path, "w"));
    this.stream = fs.createWriteStream(this.path, { flags: "a" });
  }

  /**
   * Ends the stream (flushing queued writes) and forgets it.
   * Does nothing when no stream is open.
   */
  public closeWriteStream() {
    if (this.stream === null) return;
    this.stream.end();
    this.stream = null;
  }

  /**
   * @returns The path given to the constructor.
   */
  public getPath() {
    return this.path;
  }
}
/**
 * One trending topic as observed on a particular day.
 */
class Trend {
  /**
   * @param ranking Position of the topic in that day's list.
   * @param name    Topic text.
   * @param date    Day the topic was trending.
   */
  constructor(
    private readonly ranking: number,
    private readonly name: string,
    private readonly date: Date,
  ) {}

  /** @returns The topic text. */
  public getName() {
    return this.name;
  }

  /** @returns The day this trend was recorded for. */
  public getDate() {
    return this.date;
  }

  /** @returns The position of the topic in that day's list. */
  public getRanking() {
    return this.ranking;
  }

  /**
   * Serialises the trend as one CSV line: `name, ranking, yyyy-MM-dd`
   * followed by a newline.
   */
  public toCsv() {
    const day = format(this.date, "yyyy-MM-dd");
    return `${this.name}, ${this.ranking}, ${day}\n`;
  }
}
// Entry point: run the scrape and report a failure instead of leaving the
// returned promise unhandled; a non-zero exit code signals the error to callers.
scrapeData().catch(err => {
  const reason = err instanceof Error ? err.message : String(err);
  process.stderr.write(`Scrape aborted: ${reason}\n`);
  process.exitCode = 1;
});