Comment Code
This commit is contained in:
parent
eb782d4417
commit
025c9a2908
77
index.ts
77
index.ts
|
@ -2,7 +2,7 @@
|
|||
* @file Scrapes https://trendogate.com and creates a .csv of historical trending data
|
||||
* @author Rekai Musuka <rk306597@dal.ca>
|
||||
* @license MIT
|
||||
* @version 2020.03.17
|
||||
* @version 2020.03.23
|
||||
*/
|
||||
|
||||
import { addDays, format } from 'date-fns';
|
||||
|
@ -13,7 +13,6 @@ import fs from 'fs';
|
|||
const BASE_URL = "https://trendogate.com/";
|
||||
const CSV_PATH = "./data.csv";
|
||||
const USA_ID = 23424977;
|
||||
const NUMBER_OF_DAYS = 200;
|
||||
const WAIT = 500; // in ms
|
||||
|
||||
// e.g https://trendogate.com/placebydate/23424977/2015-04-01
|
||||
|
@ -28,32 +27,43 @@ while (goal >= tmp) {
|
|||
tmp = addDays(tmp, 7);
|
||||
}
|
||||
|
||||
/**
|
||||
* Scrapes Data from trendogate.
|
||||
*
|
||||
* @return a Promise with no success value
|
||||
*/
|
||||
async function scrapeData(): Promise<void> {
  const csv = new CSVManager(CSV_PATH);
  csv.createWriteStream(); // remains open until every request has been handled

  // Requests are started at a fixed pace and complete in the background.
  // Their promises are collected so the stream is closed only after all of
  // them have finished writing.
  const pending: Array<Promise<void>> = [];

  for (const day of days) {
    const url = getNewDateUrl(day); // Construct a valid trendogate URL given a date object

    const request = axios.get(url) // GET the URL
      .then((res: { status: number; data: string }) => {
        if (res.status === 200) {
          const trends: Array<Trend> = handleHttpResponse(res.data, day);
          process.stdout.write(`URL: ${url}\n`);
          writeToCsv(trends, csv); // Write the list of trends to the spreadsheet
        }
        else process.stderr.write(`Server returned with a Status of: ${res.status}\n`);
      })
      .catch((err: unknown) => {
        // Without this handler a failed request (network error, or a status
        // axios rejects on) would surface as an unhandled promise rejection.
        const reason = err instanceof Error ? err.message : String(err);
        process.stderr.write(`Failed to scrape ${url}: ${reason}\n`);
      });
    pending.push(request);

    await sleep(WAIT); // prevents spamming the server. Decreases chances of us being IP banned.
  }

  // Every promise above handles its own failure, so this never rejects.
  await Promise.all(pending);
  csv.closeWriteStream();
}
|
||||
|
||||
/**
|
||||
* Transforms trendogate HTTP Response into a list of trends from said response.
|
||||
* @param html String of HTML data
|
||||
* @param day A Date object representing the day the data from trendogate is from.
|
||||
* @return An Array of trends
|
||||
*/
|
||||
function handleHttpResponse(html: string, day: Date): Array<Trend> {
|
||||
let trends: Array<Trend> = [];
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
$('div.panel > ul.list-group').children().each((i, child) => {
|
||||
$('div.panel > ul.list-group').children().each((i, child) => { // Query for getting the elements which house the trends
|
||||
let term = child.firstChild.firstChild.data;
|
||||
trends.push(new Trend(i + 1, term, day));
|
||||
|
||||
|
@ -69,10 +79,23 @@ function sleep(ms: number): Promise<unknown> {
|
|||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a valid trendogate URL based on a provided Date Object
|
||||
* @param date The day of the trends you want to query
|
||||
*/
|
||||
function getNewDateUrl(date: Date): string {
  // BASE_URL may end with "/"; strip trailing slashes so the joined URL does
  // not contain "//" before "placebydate".
  const base = BASE_URL.replace(/\/+$/, "");
  return `${base}/placebydate/${USA_ID}/${format(date, "yyyy-MM-dd")}`;
}
|
||||
|
||||
/**
|
||||
* Iterates over a list of trends and writes a limited amount of them to a CSV file
|
||||
*
|
||||
* Some Constraints:
|
||||
* * Number of trends per day
|
||||
* * Trend must consist of valid Latin Extended character set characters (in UTF-8 of course)
|
||||
* @param trends List of Trends
|
||||
* @param csv CSV which will be written to
|
||||
*/
|
||||
function writeToCsv(trends: Array<Trend>, csv: CSVManager): void {
|
||||
for (let i = 0; i < trends.length; i++) {
|
||||
if (i == 20) break; // Should Only Write 20 per day hopefully.
|
||||
|
@ -86,6 +109,9 @@ function writeToCsv(trends: Array<Trend>, csv: CSVManager): void {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Manages a single CSV File
|
||||
*/
|
||||
class CSVManager {
|
||||
private stream: fs.WriteStream = null;
|
||||
private path: string;
|
||||
|
@ -94,50 +120,81 @@ class CSVManager {
|
|||
this.path = path;
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes plain text to the CSV file through the open write stream
|
||||
* @param row The line of text which will be written
|
||||
*/
|
||||
public write(row: string): void {
  // Guard: without an open stream there is nowhere to write, so report and bail out.
  if (this.stream === null) {
    process.stderr.write(`Unable to write to "${this.path}". Stream does not exist.\n`);
    return;
  }

  // The callback receives an error if the write failed; it is rethrown.
  this.stream.write(row, err => { if (err) throw err; });
}
|
||||
|
||||
/**
|
||||
* Creates a write stream which must be closed later
|
||||
*
|
||||
* Also, method checks to see if file exists beforehand. If not, a new file is "touched".
|
||||
*/
|
||||
public createWriteStream(): void {
  const target = this.path;

  // "Touch" the file first when it is not there yet.
  const alreadyThere = fs.existsSync(target);
  if (!alreadyThere) {
    const fd = fs.openSync(target, "w");
    fs.closeSync(fd);
  }

  // Open in append mode so rows are added after any existing content.
  this.stream = fs.createWriteStream(target, { flags: "a" });
}
|
||||
|
||||
/**
|
||||
* Closes an open write stream
|
||||
*/
|
||||
public closeWriteStream(): void {
  // Nothing to do when the stream was never opened or has already been closed.
  if (this.stream === null) return;

  this.stream.end(); // signals that no more rows will be written
  this.stream = null;
}
|
||||
|
||||
/**
|
||||
* Gets the path of the CSV file managed by this instance
|
||||
*/
|
||||
public getPath(): string {
|
||||
return this.path;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents a trend pulled from trendogate
|
||||
*/
|
||||
class Trend {
|
||||
private name: string;
|
||||
private date: Date;
|
||||
private ranking: number;
|
||||
private ranking: number; // from 1 -> 50, a trends ranking on a specific date
|
||||
|
||||
constructor(ranking: number, name: string, date: Date) {
|
||||
this.name = name.trim();
|
||||
this.name = name.trim(); // Cuts extra whitespaece
|
||||
this.date = date;
|
||||
this.ranking = ranking;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the title of the trend
|
||||
*/
|
||||
public getName(): string {
|
||||
return this.name;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the date the trend is from
|
||||
*/
|
||||
public getDate(): Date {
|
||||
return this.date;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a trend's day - ranking.
|
||||
*/
|
||||
public getRanking(): number {
|
||||
return this.ranking;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a string which is constructed so that it can be
|
||||
* appended to a CSV File
|
||||
*/
|
||||
public toCsv(): string {
  // A name containing a comma, double quote or line break would otherwise be
  // split into extra columns/rows. Quote such names and double any embedded
  // quotes; names without those characters are written unchanged.
  const needsQuoting = /[",\r\n]/.test(this.name);
  const nameField = needsQuoting ? `"${this.name.replace(/"/g, '""')}"` : this.name;

  // Row layout: name, ranking, date (yyyy-MM-dd), terminated by a newline.
  return `${nameField}, ${this.ranking}, ${format(this.date, "yyyy-MM-dd")}\n`;
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "trendogate_scraper",
|
||||
"version": "2020.03.17",
|
||||
"version": "2020.03.23",
|
||||
"description": "Scrapes information from https://trendogate.com/ for CSCI 1107",
|
||||
"main": "index.js",
|
||||
"repository": "ssh://gitea@git.paoda.moe:31059/paoda/trendogate_scraper.git",
|
||||
|
|
Reference in New Issue