Comment Code

This commit is contained in:
paoda 2020-03-23 18:28:44 -05:00
parent eb782d4417
commit 025c9a2908
2 changed files with 68 additions and 11 deletions

View File

@ -2,7 +2,7 @@
* @file Scrapes https://trendogate.com and creates a .csv of historical trending data
* @author Rekai Musuka <rk306597@dal.ca>
* @license MIT
* @version 2020.03.17
* @version 2020.03.23
*/
import { addDays, format } from 'date-fns';
@ -13,7 +13,6 @@ import fs from 'fs';
const BASE_URL = "https://trendogate.com/";
const CSV_PATH = "./data.csv";
const USA_ID = 23424977;
const NUMBER_OF_DAYS = 200;
const WAIT = 500; // in ms
// e.g https://trendogate.com/placebydate/23424977/2015-04-01
@ -28,32 +27,43 @@ while (goal >= tmp) {
tmp = addDays(tmp, 7);
}
/**
 * Scrapes historical trend data from trendogate and appends it to the CSV file.
 *
 * One GET request is issued per entry in `days`, with a pause of `WAIT`
 * milliseconds between consecutive requests. Requests are not awaited inside
 * the loop, so responses are handled in whatever order they arrive.
 *
 * @returns A Promise with no success value. It resolves once every request has
 *          settled and the CSV write stream has been closed.
 */
async function scrapeData(): Promise<void> {
  const csv = new CSVManager(CSV_PATH);
  csv.createWriteStream(); // remains open until we're completely done with scraping

  // Every in-flight request is tracked so the stream is only closed after the
  // last response has been written.
  const pending: Array<Promise<void>> = [];

  try {
    for (let i = 0; i < days.length; i++) {
      const day = days[i];
      const url = getNewDateUrl(day); // Construct a valid trendogate URL given a date object

      const request: Promise<void> = axios.get(url) // GET the URL
        .then((res: any) => {
          if (res.status === 200) {
            const trends: Array<Trend> = handleHttpResponse(res.data, day);
            process.stdout.write(`URL: ${url}\n`);
            writeToCsv(trends, csv); // Write the list of trends to the spreadsheet
          }
          else process.stderr.write(`Server returned with a Status of: ${res.status}\n`);
        })
        .catch((err: unknown) => {
          // Without this handler a failed request (or an error thrown while
          // parsing/writing its response) is an unhandled promise rejection.
          const reason = err instanceof Error ? err.message : String(err);
          process.stderr.write(`Failed to process ${url}: ${reason}\n`);
        });

      pending.push(request);
      await sleep(WAIT); // prevents spamming the server. Decreases chances of us being IP banned.
    }

    // The handlers above never reject, so this simply waits for all of them.
    await Promise.all(pending);
  } finally {
    csv.closeWriteStream();
  }
}
/**
* Transforms trendogate HTTP Response into a list of trends from said response.
* @param html String of HTML data
* @param day A Date object representing the day the data from trendogate is from.
* @return An Array of trends
*/
function handleHttpResponse(html: string, day: Date): Array<Trend> {
let trends: Array<Trend> = [];
const $ = cheerio.load(html);
$('div.panel > ul.list-group').children().each((i, child) => {
$('div.panel > ul.list-group').children().each((i, child) => { // Query for getting the elements which house the trends
let term = child.firstChild.firstChild.data;
trends.push(new Trend(i + 1, term, day));
@ -69,10 +79,23 @@ function sleep(ms: number): Promise<unknown> {
});
}
/**
 * Creates a valid trendogate URL based on a provided Date object.
 *
 * e.g. https://trendogate.com/placebydate/23424977/2015-04-01
 *
 * @param date The day of the trends you want to query
 * @returns The trendogate "placebydate" URL for `USA_ID` on the given day
 */
function getNewDateUrl(date: Date): string {
  // BASE_URL already ends with "/". Strip trailing slashes before joining so
  // the result does not contain "//placebydate".
  const root = BASE_URL.replace(/\/+$/, "");
  return `${root}/placebydate/${USA_ID}/${format(date, "yyyy-MM-dd")}`;
}
/**
* Iterates over a list of trends and writes a limited amount of them to a CSV file
*
* Some Constraints:
* * Number of trends per day
* * Trend must consist of valid Latin Extended characters (in UTF-8 of course)
* @param trends List of Trends
* @param csv CSV which will be written to
*/
function writeToCsv(trends: Array<Trend>, csv: CSVManager): void {
for (let i = 0; i < trends.length; i++) {
if (i == 20) break; // Should Only Write 20 per day hopefully.
@ -86,6 +109,9 @@ function writeToCsv(trends: Array<Trend>, csv: CSVManager): void {
}
}
/**
* Manages a single CSV File
*/
class CSVManager {
private stream: fs.WriteStream = null;
private path: string;
@ -94,50 +120,81 @@ class CSVManager {
this.path = path;
}
/**
 * Writes a line of plaintext to the CSV file through the open write stream.
 *
 * If no stream is currently open, nothing is written and a message is printed
 * to stderr instead. An error reported by the stream's write callback is
 * thrown from inside that callback, not to the caller of this method.
 *
 * @param row The line of text which will be written
 */
public write(row: string): void {
  if (this.stream === null) {
    // Terminated with a newline like every other diagnostic in this file.
    process.stderr.write(`Unable to write to "${this.path}". Stream does not exist.\n`);
    return;
  }

  this.stream.write(row, err => { if (err) throw err; });
}
/**
 * Opens an append-mode write stream on the managed file. The stream must be
 * closed later.
 *
 * Before opening the stream, the file is "touched" (created empty) if it does
 * not exist yet.
 */
public createWriteStream(): void {
  const fileIsMissing = !fs.existsSync(this.path);

  if (fileIsMissing) {
    // Create an empty file, then release the descriptor straight away.
    const descriptor = fs.openSync(this.path, "w");
    fs.closeSync(descriptor);
  }

  this.stream = fs.createWriteStream(this.path, { flags: "a" });
}
/**
 * Closes the open write stream, if there is one.
 *
 * Calling this when no stream is open (never created, or already closed) is a
 * no-op, matching the null check performed by `write`.
 */
public closeWriteStream(): void {
  if (this.stream === null) return; // nothing to close

  this.stream.end();
  this.stream = null;
}
/**
* Gets the path of the CSV file managed by this instance.
*
* @returns The path string stored in `this.path`
*/
public getPath(): string {
return this.path;
}
}
/**
* Represents a trend pulled from trendogate
*/
class Trend {
private name: string;
private date: Date;
private ranking: number;
private ranking: number; // from 1 -> 50, a trends ranking on a specific date
/**
 * Builds a trend from its ranking, title and date.
 *
 * @param ranking The trend's ranking on the given date
 * @param name Title of the trend; surrounding whitespace is trimmed
 * @param date The day the trend is from
 */
constructor(ranking: number, name: string, date: Date) {
  this.ranking = ranking;
  this.date = date;
  this.name = name.trim(); // Cuts extra whitespace
}
/**
* Gets the title of the trend.
*
* @returns The name given at construction, with surrounding whitespace trimmed
*/
public getName(): string {
return this.name;
}
/**
* Gets the date the trend is from.
*
* @returns The same Date instance that was passed to the constructor (not a copy)
*/
public getDate(): Date {
return this.date;
}
/**
* Gets the trend's ranking on its date.
*
* @returns The ranking number that was passed to the constructor
*/
public getRanking(): number {
return this.ranking;
}
/**
 * Returns a string which is constructed so that it can be appended to a CSV
 * file: `name, ranking, yyyy-MM-dd` followed by a newline.
 *
 * A name containing a comma, a double quote or a line break is wrapped in
 * double quotes (with embedded quotes doubled) so it cannot split or corrupt
 * the row. Any other name is written unchanged.
 *
 * @returns One CSV row, terminated by "\n"
 */
public toCsv(): string {
  const needsQuoting = /[",\r\n]/.test(this.name);
  const name = needsQuoting ? `"${this.name.replace(/"/g, '""')}"` : this.name;

  return `${name}, ${this.ranking}, ${format(this.date, "yyyy-MM-dd")}\n`;
}

View File

@ -1,6 +1,6 @@
{
"name": "trendogate_scraper",
"version": "2020.03.17",
"version": "2020.03.23",
"description": "Scrapes information from https://trendogate.com/ for CSCI 1107",
"main": "index.js",
"repository": "ssh://gitea@git.paoda.moe:31059/paoda/trendogate_scraper.git",