This repository has been archived on 2021-06-30. You can view files and clone it, but cannot push or open issues or pull requests.
trendogate_scraper/index.ts

141 lines
3.5 KiB
TypeScript

"use strict";
import { addDays, format } from 'date-fns';
import cheerio from 'cheerio';
import axios from 'axios';
import fs from 'fs';
// Root address of the site being scraped.
const BASE_URL = "https://trendogate.com/";
// File that scraped rows are appended to.
const CSV_PATH = "./data.csv";
// Place id inserted into the request path (see the example URL below).
const USA_ID = 23424977;
// How many past days to request, counting back from yesterday.
const NUMBER_OF_DAYS = 200;
// Pause between consecutive requests, in milliseconds.
const WAIT = 500;
// Example request URL: https://trendogate.com/placebydate/23424977/2015-04-01
// CSV columns: Topic | Position | Date
const now: Date = new Date();
let tmp: Date = now;
let days: Array<Date> = [];
// Step back one day at a time so days[0] is yesterday and the last entry is
// NUMBER_OF_DAYS days before the moment the script started.
while (days.length < NUMBER_OF_DAYS) {
  tmp = addDays(tmp, -1);
  days.push(tmp);
}
/**
 * Fetches the trend page for every entry in `days`, extracts the trends and
 * appends them to the CSV file at `CSV_PATH`.
 *
 * Requests are issued one at a time, with a pause of `WAIT` milliseconds after
 * each one, so rows are written in the same order as `days`. A failure while
 * fetching or parsing a single day is reported on stderr and does not stop the
 * remaining days. The CSV stream is closed once the loop finishes, whether or
 * not it completed normally.
 *
 * @returns A promise that settles after every day has been processed and the
 *          stream has been asked to close.
 */
async function scrapeData(): Promise<void> {
  const csv = new CSVManager(CSV_PATH);
  csv.createWriteStream();
  try {
    for (const day of days) {
      const url = getNewDateUrl(day);
      try {
        // Awaiting here (instead of a detached .then chain) keeps rejections
        // inside this try block; axios rejects on network errors and, by
        // default, on non-2xx responses.
        const res = await axios.get(url);
        if (res.status === 200) {
          const trends: Array<Trend> = handleHttpResponse(res.data, day);
          process.stdout.write(`URL: ${url}\n`);
          writeToCsv(trends, csv);
        } else {
          process.stderr.write(`Server returned with a Status of: ${res.status}\n`);
        }
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        process.stderr.write(`Failed to process ${url}: ${reason}\n`);
      }
      // Throttle so the remote server is not hit in a tight loop.
      await sleep(WAIT);
    }
  } finally {
    csv.closeWriteStream();
  }
}
/**
 * Extracts the trend list from one day's HTML page.
 *
 * Every child of `div.panel > ul.list-group` is treated as one trend; the
 * trend name is read from the first node nested inside the child's first node.
 * Entries that lack that nested structure, or whose text is empty, are skipped
 * instead of raising a TypeError. Rankings are always the 1-based position of
 * the entry in the list, so a skipped entry leaves a gap rather than shifting
 * the entries after it.
 *
 * @param html Raw HTML of the page.
 * @param day  Date the page belongs to; stored on every returned trend.
 * @returns The trends found on the page, in page order.
 */
function handleHttpResponse(html: string, day: Date): Array<Trend> {
  const trends: Array<Trend> = [];
  const $ = cheerio.load(html);
  $('div.panel > ul.list-group').children().each((i, child) => {
    const wrapper = child.firstChild;
    const textNode = wrapper ? wrapper.firstChild : null;
    const term = textNode ? textNode.data : undefined;
    // Returning undefined from the callback moves on to the next entry.
    if (typeof term !== "string" || term.trim() === "") return;
    trends.push(new Trend(i + 1, term, day));
  });
  return trends;
}
/**
 * Returns a promise that resolves (with no value) once `ms` milliseconds have
 * elapsed; `await` it to pause an async function.
 *
 * Approach taken from:
 * https://stackoverflow.com/questions/14249506/how-can-i-wait-in-node-js-javascript-l-need-to-pause-for-a-period-of-time
 *
 * @param ms Delay in milliseconds.
 */
function sleep(ms: number) {
  const delay = new Promise(wake => setTimeout(wake, ms));
  return delay;
}
/**
 * Builds the request URL for one day, in the form
 * `<base>/placebydate/<USA_ID>/<yyyy-MM-dd>`.
 *
 * Trailing slashes are stripped from `BASE_URL` before the path is appended,
 * so the result has exactly one slash after the host whether or not the
 * constant ends with one.
 *
 * @param date Day to request.
 * @returns The absolute URL for that day.
 */
function getNewDateUrl(date: Date): string {
  const base = BASE_URL.replace(/\/+$/, "");
  return `${base}/placebydate/${USA_ID}/${format(date, "yyyy-MM-dd")}`;
}
/**
 * Writes one day's trends to the CSV.
 *
 * Only the first 20 entries of `trends` are considered, and of those only the
 * ones whose name consists entirely of Latin letters (including the accented
 * ranges listed in the pattern), whitespace and `#` are written. Entries that
 * fail the name check still count towards the 20, so fewer than 20 rows may be
 * written.
 *
 * @param trends Trends for a single day, in ranking order.
 * @param csv    Destination the rows are written to.
 */
function writeToCsv(trends: Array<Trend>, csv: CSVManager) {
  const allowedName = /^([A-Za-z\u00C0-\u00D6\u00D8-\u00f6\u00f8-\u00ff\s\#]*)$/i;
  trends.slice(0, 20).forEach(entry => {
    if (entry.getName().match(allowedName)) {
      csv.write(entry.toCsv());
    }
  });
}
/**
 * Small wrapper around an append-mode file stream used to emit CSV rows.
 *
 * The stream is opened explicitly with `createWriteStream()` and released with
 * `closeWriteStream()`; `write()` only reports on stderr when no stream is
 * open.
 */
class CSVManager {
  // null whenever no stream is currently open.
  private stream: fs.WriteStream | null = null;
  private path: string;

  /**
   * @param path Location of the CSV file.
   */
  constructor(path: string) {
    this.path = path;
  }

  /**
   * Queues `row` on the open stream. If no stream is open, a message is
   * written to stderr and the row is dropped.
   *
   * @param row Text to append; the caller supplies any line terminator.
   */
  public write(row: string) {
    if (this.stream !== null) {
      // A failed write is rethrown from the stream callback (fail fast).
      this.stream.write(row, err => { if (err) throw err; });
    } else process.stderr.write(`Unable to write to "${this.path}". Stream does not exist.\n`);
  }

  /**
   * Opens the file for appending, creating it first when it does not exist.
   * Does nothing if a stream is already open, so an earlier stream is never
   * orphaned by a second call.
   */
  public createWriteStream() {
    if (this.stream !== null) return;
    if (!fs.existsSync(this.path)) fs.closeSync(fs.openSync(this.path, "w"));
    this.stream = fs.createWriteStream(this.path, { flags: "a" });
  }

  /**
   * Ends the open stream and forgets it. Safe to call when no stream is open.
   */
  public closeWriteStream() {
    if (this.stream === null) return;
    this.stream.end();
    this.stream = null;
  }

  /**
   * @returns The path this manager was constructed with.
   */
  public getPath() {
    return this.path;
  }
}
/**
 * One trending topic: its name, its position in the list and the day it was
 * observed.
 */
class Trend {
  private name: string;
  private date: Date;
  private ranking: number;

  /**
   * @param ranking Position of the topic in the list.
   * @param name    Topic text.
   * @param date    Day the topic was listed.
   */
  constructor(ranking: number, name: string, date: Date) {
    this.name = name;
    this.date = date;
    this.ranking = ranking;
  }

  /** @returns The topic text. */
  public getName() { return this.name; }

  /** @returns The day the topic was listed. */
  public getDate() { return this.date; }

  /** @returns The position of the topic in the list. */
  public getRanking() { return this.ranking; }

  /**
   * Serialises the trend as a single newline-terminated row:
   * `name, ranking, yyyy-MM-dd`. No quoting or escaping is applied.
   */
  public toCsv() {
    const isoDay = format(this.date, "yyyy-MM-dd");
    return this.name + ", " + this.ranking + ", " + isoDay + "\n";
  }
}
// Entry point. The returned promise is observed so that a failure is reported
// on stderr and reflected in the exit code instead of surfacing as an
// unhandled rejection.
scrapeData().catch((err) => {
  const reason = err instanceof Error ? err.message : String(err);
  process.stderr.write(`Scrape aborted: ${reason}\n`);
  process.exitCode = 1;
});