Comment Code
This commit is contained in:
parent
eb782d4417
commit
025c9a2908
77
index.ts
77
index.ts
|
@ -2,7 +2,7 @@
|
||||||
* @file Scrapes https://trendogate.com and creates a .csv of historical trending data
|
* @file Scrapes https://trendogate.com and creates a .csv of historical trending data
|
||||||
* @author Rekai Musuka <rk306597@dal.ca>
|
* @author Rekai Musuka <rk306597@dal.ca>
|
||||||
* @license MIT
|
* @license MIT
|
||||||
* @version 2020.03.17
|
* @version 2020.03.23
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { addDays, format } from 'date-fns';
|
import { addDays, format } from 'date-fns';
|
||||||
|
@ -13,7 +13,6 @@ import fs from 'fs';
|
||||||
const BASE_URL = "https://trendogate.com/";
|
const BASE_URL = "https://trendogate.com/";
|
||||||
const CSV_PATH = "./data.csv";
|
const CSV_PATH = "./data.csv";
|
||||||
const USA_ID = 23424977;
|
const USA_ID = 23424977;
|
||||||
const NUMBER_OF_DAYS = 200;
|
|
||||||
const WAIT = 500; // in ms
|
const WAIT = 500; // in ms
|
||||||
|
|
||||||
// e.g https://trendogate.com/placebydate/23424977/2015-04-01
|
// e.g https://trendogate.com/placebydate/23424977/2015-04-01
|
||||||
|
@ -28,32 +27,43 @@ while (goal >= tmp) {
|
||||||
tmp = addDays(tmp, 7);
|
tmp = addDays(tmp, 7);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Scrapes Data from trendogate.
|
||||||
|
*
|
||||||
|
* @return a Promise with no success value
|
||||||
|
*/
|
||||||
async function scrapeData(): Promise<void> {
|
async function scrapeData(): Promise<void> {
|
||||||
const csv = new CSVManager(CSV_PATH);
|
const csv = new CSVManager(CSV_PATH);
|
||||||
csv.createWriteStream();
|
csv.createWriteStream(); // remains open until we're completely done with scraping
|
||||||
|
|
||||||
for (let i = 0; i < days.length; i++) {
|
for (let i = 0; i < days.length; i++) {
|
||||||
const url = getNewDateUrl(days[i]);
|
const url = getNewDateUrl(days[i]); // Construct a valid trendogate URL given a date object
|
||||||
axios.get(url).then((res: any) => {
|
axios.get(url).then((res: any) => { // GET the URL
|
||||||
|
|
||||||
if (res.status === 200) {
|
if (res.status === 200) {
|
||||||
let trends: Array<Trend> = handleHttpResponse(res.data, days[i]);
|
let trends: Array<Trend> = handleHttpResponse(res.data, days[i]);
|
||||||
process.stdout.write(`URL: ${url}\n`);
|
process.stdout.write(`URL: ${url}\n`);
|
||||||
writeToCsv(trends, csv);
|
writeToCsv(trends, csv); // Write the list of trends to the spreadsheet
|
||||||
|
|
||||||
}
|
}
|
||||||
else process.stderr.write(`Server returned with a Status of: ${res.status}\n`);
|
else process.stderr.write(`Server returned with a Status of: ${res.status}\n`);
|
||||||
});
|
});
|
||||||
|
|
||||||
await sleep(WAIT);
|
await sleep(WAIT); // prevents spamming the server. Decreases chances of us being IP banned.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Transforms trendogate HTTP Response into a list of trends from said response.
|
||||||
|
* @param html String of HTML data
|
||||||
|
* @param day A Date object representing the day the data from trendogate is from.
|
||||||
|
* @return An Array of trends
|
||||||
|
*/
|
||||||
function handleHttpResponse(html: string, day: Date): Array<Trend> {
|
function handleHttpResponse(html: string, day: Date): Array<Trend> {
|
||||||
let trends: Array<Trend> = [];
|
let trends: Array<Trend> = [];
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
|
|
||||||
$('div.panel > ul.list-group').children().each((i, child) => {
|
$('div.panel > ul.list-group').children().each((i, child) => { // Query for getting the elements which house the trends
|
||||||
let term = child.firstChild.firstChild.data;
|
let term = child.firstChild.firstChild.data;
|
||||||
trends.push(new Trend(i + 1, term, day));
|
trends.push(new Trend(i + 1, term, day));
|
||||||
|
|
||||||
|
@ -69,10 +79,23 @@ function sleep(ms: number): Promise<unknown> {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a valid trendogate URL based on a provided Date Object
|
||||||
|
* @param date The day of the trends you want to query
|
||||||
|
*/
|
||||||
function getNewDateUrl(date: Date): string {
|
function getNewDateUrl(date: Date): string {
|
||||||
return `${BASE_URL}/placebydate/${USA_ID}/${format(date, "yyyy-MM-dd")}`;
|
return `${BASE_URL}/placebydate/${USA_ID}/${format(date, "yyyy-MM-dd")}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Iterates over a list of trends and writes a limited amount of them to a CSV file
|
||||||
|
*
|
||||||
|
* Some Constraints:
|
||||||
|
* * Number of trends per day
|
||||||
|
* * Trend must be in the valid Latin Extended character set (in UTF-8 of course)
|
||||||
|
* @param trends List of Trends
|
||||||
|
* @param csv CSV which will be written to
|
||||||
|
*/
|
||||||
function writeToCsv(trends: Array<Trend>, csv: CSVManager): void {
|
function writeToCsv(trends: Array<Trend>, csv: CSVManager): void {
|
||||||
for (let i = 0; i < trends.length; i++) {
|
for (let i = 0; i < trends.length; i++) {
|
||||||
if (i == 20) break; // Should Only Write 20 per day hopefully.
|
if (i == 20) break; // Should Only Write 20 per day hopefully.
|
||||||
|
@ -86,6 +109,9 @@ function writeToCsv(trends: Array<Trend>, csv: CSVManager): void {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Manages a single CSV File
|
||||||
|
*/
|
||||||
class CSVManager {
|
class CSVManager {
|
||||||
private stream: fs.WriteStream = null;
|
private stream: fs.WriteStream = null;
|
||||||
private path: string;
|
private path: string;
|
||||||
|
@ -94,50 +120,81 @@ class CSVManager {
|
||||||
this.path = path;
|
this.path = path;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes a line of plain text to the CSV file's write stream
|
||||||
|
* @param row The line of text which will be written
|
||||||
|
*/
|
||||||
public write(row: string): void {
|
public write(row: string): void {
|
||||||
if (this.stream !== null) {
|
if (this.stream !== null) {
|
||||||
this.stream.write(row, err => { if (err) throw err; });
|
this.stream.write(row, err => { if (err) throw err; });
|
||||||
} else process.stderr.write(`Unable to write to "${this.path}". Stream does not exist.`);
|
} else process.stderr.write(`Unable to write to "${this.path}". Stream does not exist.`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a write stream which must be closed later
|
||||||
|
*
|
||||||
|
* Also, method checks to see if file exists beforehand. If not, a new file is "touched".
|
||||||
|
*/
|
||||||
public createWriteStream(): void {
|
public createWriteStream(): void {
|
||||||
if (!fs.existsSync(this.path)) fs.closeSync(fs.openSync(this.path, "w"));
|
if (!fs.existsSync(this.path)) fs.closeSync(fs.openSync(this.path, "w"));
|
||||||
this.stream = fs.createWriteStream(this.path, { flags: "a" });
|
this.stream = fs.createWriteStream(this.path, { flags: "a" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Closes an open write stream
|
||||||
|
*/
|
||||||
public closeWriteStream(): void {
|
public closeWriteStream(): void {
|
||||||
this.stream.end();
|
this.stream.end();
|
||||||
this.stream = null;
|
this.stream = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the path of the CSV file being managed
|
||||||
|
*/
|
||||||
public getPath(): string {
|
public getPath(): string {
|
||||||
return this.path;
|
return this.path;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Represents a trend pulled from trendogate
|
||||||
|
*/
|
||||||
class Trend {
|
class Trend {
|
||||||
private name: string;
|
private name: string;
|
||||||
private date: Date;
|
private date: Date;
|
||||||
private ranking: number;
|
private ranking: number; // from 1 -> 50, a trend's ranking on a specific date
|
||||||
|
|
||||||
constructor(ranking: number, name: string, date: Date) {
|
constructor(ranking: number, name: string, date: Date) {
|
||||||
this.name = name.trim();
|
this.name = name.trim(); // Cuts extra whitespace
|
||||||
this.date = date;
|
this.date = date;
|
||||||
this.ranking = ranking;
|
this.ranking = ranking;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the title of the trend
|
||||||
|
*/
|
||||||
public getName(): string {
|
public getName(): string {
|
||||||
return this.name;
|
return this.name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the date the trend is from
|
||||||
|
*/
|
||||||
public getDate(): Date {
|
public getDate(): Date {
|
||||||
return this.date;
|
return this.date;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the trend's ranking on its date.
|
||||||
|
*/
|
||||||
public getRanking(): number {
|
public getRanking(): number {
|
||||||
return this.ranking;
|
return this.ranking;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a string which is constructed so that it can be
|
||||||
|
* appended to a CSV File
|
||||||
|
*/
|
||||||
public toCsv(): string {
|
public toCsv(): string {
|
||||||
return `${this.name}, ${this.ranking}, ${format(this.date, "yyyy-MM-dd")}\n`;
|
return `${this.name}, ${this.ranking}, ${format(this.date, "yyyy-MM-dd")}\n`;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "trendogate_scraper",
|
"name": "trendogate_scraper",
|
||||||
"version": "2020.03.17",
|
"version": "2020.03.23",
|
||||||
"description": "Scrapes information from https://trendogate.com/ for CSCI 1107",
|
"description": "Scrapes information from https://trendogate.com/ for CSCI 1107",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
"repository": "ssh://gitea@git.paoda.moe:31059/paoda/trendogate_scraper.git",
|
"repository": "ssh://gitea@git.paoda.moe:31059/paoda/trendogate_scraper.git",
|
||||||
|
|
Reference in New Issue