Working Build of Site -> CSV Scraper

This commit is contained in:
paoda 2020-03-05 14:44:18 -04:00
parent 0adcb77c12
commit 868477a9ae
5 changed files with 349 additions and 25 deletions

3
.gitignore vendored
View File

@ -76,4 +76,5 @@ typings/
# FuseBox cache
.fusebox/
out/
out/
data.csv

156
index.ts
View File

@ -1,35 +1,147 @@
"use strict";
const SITE_URL = "https://trendogate.com/";
const NUMBER_OF_DAYS = 100;
import { addDays } from 'date-fns';
import cheerio from 'cheerio';
import axios from 'axios';
import fs from 'fs';
const BASE_URL = "https://trendogate.com/";
const CSV_PATH = "./data.csv";
const USA_ID = 23424977;
// https://trendogate.com/placebydate/23424977/2015-04-01
const NUMBER_OF_DAYS = 100;
const WAIT = 5000; // in ms
// Topic | Date | Position
// e.g https://trendogate.com/placebydate/23424977/2015-04-01
// Topic | Position | Date
// SO: https://stackoverflow.com/questions/4345045/javascript-loop-between-date-ranges/14655646
const now: Date = new Date();
let tmp : Date = now;
let days: Array<Date> = [];
for (let i = 0; i < 100; i++) {
tmp = new Date(tmp.getDate() - 1);
for (let i = 0; i < NUMBER_OF_DAYS; i++) {
tmp = addDays(tmp, -1);
days.push(tmp);
}
function getNewDateURL(date: Date): String {
let year: Number = date.getFullYear();
let month: Number = date.getMonth();
let day: Number = date.getDay();
let monthFmt: String;
let dayFmt: String;
if (month < 10) monthFmt = `0${month}`;
else monthFmt = `${month}`;
async function scrapeData() {
const csv = new CSVManager(CSV_PATH);
csv.createWriteStream();
if (day < 10) dayFmt = `0${day}`;
else dayFmt = `${day}`;
for (let i = 0; i < days.length; i++) {
const url = getNewDateUrl(days[i]);
axios.get(url).then((res: any) => {
return `${SITE_URL}/placebydate/${USA_ID}/${year}-${monthFmt}-${dayFmt}`;
}
if (res.status === 200) {
let trends: Array<Trend> = handleHttpResponse(res.data, days[i]);
process.stdout.write(`URL: ${url}\n`);
writeToCsv(trends, csv);
}
else process.stderr.write(`Server returned with a Status of: ${res.status}\n`);
});
await sleep(WAIT);
}
}
function handleHttpResponse(html: string, day: Date): Array<Trend> {
let trends: Array<Trend> = [];
const $ = cheerio.load(html);
$('div.panel > ul.list-group').children().each((i, child) => {
let term = child.firstChild.firstChild.data;
trends.push(new Trend(i + 1, term, day));
});
return trends;
}
// SO: https://stackoverflow.com/questions/14249506/how-can-i-wait-in-node-js-javascript-l-need-to-pause-for-a-period-of-time
function sleep(ms: number) {
return new Promise(res => {
setTimeout(res, ms);
});
}
function getNewDateUrl(date: Date): string {
const year: number = date.getFullYear();
const month: number = date.getMonth();
const day: number = date.getDay();
let monthStr: string;
let dayStr: string;
if (month < 10) monthStr = `0${month}`;
else monthStr = `${month}`;
if (day < 10) dayStr = `0${day}`;
else dayStr = `${day}`;
return `${BASE_URL}/placebydate/${USA_ID}/${year}-${monthStr}-${dayStr}`;
}
function writeToCsv(trends: Array<Trend>, csv: CSVManager) {
for (let i = 0; i < trends.length; i++) {
csv.write(trends[i].toCsv());
}
}
class CSVManager {
private stream: fs.WriteStream = null;
private path: string;
constructor(path: string) {
this.path = path;
}
public write(row: string) {
if (this.stream !== null) {
this.stream.write(row, err => { if (err) throw err; });
} else process.stderr.write(`Unable to write to "${this.path}". Stream does not exist.`);
}
public createWriteStream() {
if (fs.existsSync(this.path)) fs.closeSync(fs.openSync(this.path, "w"));
this.stream = fs.createWriteStream(this.path, { flags: "a" });
}
public closeWriteStream() {
this.stream.end();
this.stream = null;
}
public getPath() {
return this.path;
}
}
class Trend {
private name: string;
private date: Date;
private ranking: number;
constructor(ranking: number, name: string, date: Date) {
this.name = name;
this.date = date;
this.ranking = ranking;
}
public getName() {
return this.name;
}
public getDate() {
return this.date;
}
public getRanking() {
return this.ranking;
}
public toCsv() {
return `${this.name}, ${this.ranking}, ${this.date}\n`;
}
}
scrapeData();

View File

@ -5,6 +5,17 @@
"main": "index.js",
"repository": "ssh://gitea@git.paoda.moe:31059/paoda/trendogate_scraper.git",
"author": "paoda <musukarekai@gmail.com>",
"scripts": {
"start": "node ./out/index.js"
},
"license": "MIT",
"private": true
"private": true,
"dependencies": {
"axios": "^0.19.2",
"cheerio": "^1.0.0-rc.3",
"date-fns": "^2.10.0"
},
"devDependencies": {
"@types/cheerio": "^0.22.16"
}
}

View File

@ -3,7 +3,8 @@
"target": "ES5",
"module": "CommonJS",
"outDir": "out",
"sourceMap": true
"sourceMap": true,
"esModuleInterop": true
},
"compileOnSave": true
}

199
yarn.lock Normal file
View File

@ -0,0 +1,199 @@
# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
# yarn lockfile v1
"@types/cheerio@^0.22.16":
version "0.22.16"
resolved "https://registry.yarnpkg.com/@types/cheerio/-/cheerio-0.22.16.tgz#c748a97b8a6f781b04bbda4a552e11b35bcc77e4"
integrity sha512-bSbnU/D4yzFdzLpp3+rcDj0aQQMIRUBNJU7azPxdqMpnexjUSvGJyDuOBQBHeOZh1mMKgsJm6Dy+LLh80Ew4tQ==
dependencies:
"@types/node" "*"
"@types/node@*":
version "13.7.7"
resolved "https://registry.yarnpkg.com/@types/node/-/node-13.7.7.tgz#1628e6461ba8cc9b53196dfeaeec7b07fa6eea99"
integrity sha512-Uo4chgKbnPNlxQwoFmYIwctkQVkMMmsAoGGU4JKwLuvBefF0pCq4FybNSnfkfRCpC7ZW7kttcC/TrRtAJsvGtg==
axios@^0.19.2:
version "0.19.2"
resolved "https://registry.yarnpkg.com/axios/-/axios-0.19.2.tgz#3ea36c5d8818d0d5f8a8a97a6d36b86cdc00cb27"
integrity sha512-fjgm5MvRHLhx+osE2xoekY70AhARk3a6hkN+3Io1jc00jtquGvxYlKlsFUhmUET0V5te6CcZI7lcv2Ym61mjHA==
dependencies:
follow-redirects "1.5.10"
boolbase@~1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e"
integrity sha1-aN/1++YMUes3cl6p4+0xDcwed24=
cheerio@^1.0.0-rc.3:
version "1.0.0-rc.3"
resolved "https://registry.yarnpkg.com/cheerio/-/cheerio-1.0.0-rc.3.tgz#094636d425b2e9c0f4eb91a46c05630c9a1a8bf6"
integrity sha512-0td5ijfUPuubwLUu0OBoe98gZj8C/AA+RW3v67GPlGOrvxWjZmBXiBCRU+I8VEiNyJzjth40POfHiz2RB3gImA==
dependencies:
css-select "~1.2.0"
dom-serializer "~0.1.1"
entities "~1.1.1"
htmlparser2 "^3.9.1"
lodash "^4.15.0"
parse5 "^3.0.1"
css-select@~1.2.0:
version "1.2.0"
resolved "https://registry.yarnpkg.com/css-select/-/css-select-1.2.0.tgz#2b3a110539c5355f1cd8d314623e870b121ec858"
integrity sha1-KzoRBTnFNV8c2NMUYj6HCxIeyFg=
dependencies:
boolbase "~1.0.0"
css-what "2.1"
domutils "1.5.1"
nth-check "~1.0.1"
css-what@2.1:
version "2.1.3"
resolved "https://registry.yarnpkg.com/css-what/-/css-what-2.1.3.tgz#a6d7604573365fe74686c3f311c56513d88285f2"
integrity sha512-a+EPoD+uZiNfh+5fxw2nO9QwFa6nJe2Or35fGY6Ipw1R3R4AGz1d1TEZrCegvw2YTmZ0jXirGYlzxxpYSHwpEg==
date-fns@^2.10.0:
version "2.10.0"
resolved "https://registry.yarnpkg.com/date-fns/-/date-fns-2.10.0.tgz#abd10604d8bafb0bcbd2ba2e9b0563b922ae4b6b"
integrity sha512-EhfEKevYGWhWlZbNeplfhIU/+N+x0iCIx7VzKlXma2EdQyznVlZhCptXUY+BegNpPW2kjdx15Rvq503YcXXrcA==
debug@=3.1.0:
version "3.1.0"
resolved "https://registry.yarnpkg.com/debug/-/debug-3.1.0.tgz#5bb5a0672628b64149566ba16819e61518c67261"
integrity sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==
dependencies:
ms "2.0.0"
dom-serializer@0:
version "0.2.2"
resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-0.2.2.tgz#1afb81f533717175d478655debc5e332d9f9bb51"
integrity sha512-2/xPb3ORsQ42nHYiSunXkDjPLBaEj/xTwUO4B7XCZQTRk7EBtTOPaygh10YAAh2OI1Qrp6NWfpAhzswj0ydt9g==
dependencies:
domelementtype "^2.0.1"
entities "^2.0.0"
dom-serializer@~0.1.1:
version "0.1.1"
resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-0.1.1.tgz#1ec4059e284babed36eec2941d4a970a189ce7c0"
integrity sha512-l0IU0pPzLWSHBcieZbpOKgkIn3ts3vAh7ZuFyXNwJxJXk/c4Gwj9xaTJwIDVQCXawWD0qb3IzMGH5rglQaO0XA==
dependencies:
domelementtype "^1.3.0"
entities "^1.1.1"
domelementtype@1, domelementtype@^1.3.0, domelementtype@^1.3.1:
version "1.3.1"
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-1.3.1.tgz#d048c44b37b0d10a7f2a3d5fee3f4333d790481f"
integrity sha512-BSKB+TSpMpFI/HOxCNr1O8aMOTZ8hT3pM3GQ0w/mWRmkhEDSFJkkyzz4XQsBV44BChwGkrDfMyjVD0eA2aFV3w==
domelementtype@^2.0.1:
version "2.0.1"
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.0.1.tgz#1f8bdfe91f5a78063274e803b4bdcedf6e94f94d"
integrity sha512-5HOHUDsYZWV8FGWN0Njbr/Rn7f/eWSQi1v7+HsUVwXgn8nWWlL64zKDkS0n8ZmQ3mlWOMuXOnR+7Nx/5tMO5AQ==
domhandler@^2.3.0:
version "2.4.2"
resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-2.4.2.tgz#8805097e933d65e85546f726d60f5eb88b44f803"
integrity sha512-JiK04h0Ht5u/80fdLMCEmV4zkNh2BcoMFBmZ/91WtYZ8qVXSKjiw7fXMgFPnHcSZgOo3XdinHvmnDUeMf5R4wA==
dependencies:
domelementtype "1"
domutils@1.5.1:
version "1.5.1"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-1.5.1.tgz#dcd8488a26f563d61079e48c9f7b7e32373682cf"
integrity sha1-3NhIiib1Y9YQeeSMn3t+Mjc2gs8=
dependencies:
dom-serializer "0"
domelementtype "1"
domutils@^1.5.1:
version "1.7.0"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-1.7.0.tgz#56ea341e834e06e6748af7a1cb25da67ea9f8c2a"
integrity sha512-Lgd2XcJ/NjEw+7tFvfKxOzCYKZsdct5lczQ2ZaQY8Djz7pfAD3Gbp8ySJWtreII/vDlMVmxwa6pHmdxIYgttDg==
dependencies:
dom-serializer "0"
domelementtype "1"
entities@^1.1.1, entities@~1.1.1:
version "1.1.2"
resolved "https://registry.yarnpkg.com/entities/-/entities-1.1.2.tgz#bdfa735299664dfafd34529ed4f8522a275fea56"
integrity sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w==
entities@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/entities/-/entities-2.0.0.tgz#68d6084cab1b079767540d80e56a39b423e4abf4"
integrity sha512-D9f7V0JSRwIxlRI2mjMqufDrRDnx8p+eEOz7aUM9SuvF8gsBzra0/6tbjl1m8eQHrZlYj6PxqE00hZ1SAIKPLw==
follow-redirects@1.5.10:
version "1.5.10"
resolved "https://registry.yarnpkg.com/follow-redirects/-/follow-redirects-1.5.10.tgz#7b7a9f9aea2fdff36786a94ff643ed07f4ff5e2a"
integrity sha512-0V5l4Cizzvqt5D44aTXbFZz+FtyXV1vrDN6qrelxtfYQKW0KO0W2T/hkE8xvGa/540LkZlkaUjO4ailYTFtHVQ==
dependencies:
debug "=3.1.0"
htmlparser2@^3.9.1:
version "3.10.1"
resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-3.10.1.tgz#bd679dc3f59897b6a34bb10749c855bb53a9392f"
integrity sha512-IgieNijUMbkDovyoKObU1DUhm1iwNYE/fuifEoEHfd1oZKZDaONBSkal7Y01shxsM49R4XaMdGez3WnF9UfiCQ==
dependencies:
domelementtype "^1.3.1"
domhandler "^2.3.0"
domutils "^1.5.1"
entities "^1.1.1"
inherits "^2.0.1"
readable-stream "^3.1.1"
inherits@^2.0.1, inherits@^2.0.3:
version "2.0.4"
resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c"
integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==
lodash@^4.15.0:
version "4.17.15"
resolved "https://registry.yarnpkg.com/lodash/-/lodash-4.17.15.tgz#b447f6670a0455bbfeedd11392eff330ea097548"
integrity sha512-8xOcRHvCjnocdS5cpwXQXVzmmh5e5+saE2QGoeQmbKmRS6J3VQppPOIt0MnmE+4xlZoumy0GPG0D0MVIQbNA1A==
ms@2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/ms/-/ms-2.0.0.tgz#5608aeadfc00be6c2901df5f9861788de0d597c8"
integrity sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=
nth-check@~1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/nth-check/-/nth-check-1.0.2.tgz#b2bd295c37e3dd58a3bf0700376663ba4d9cf05c"
integrity sha512-WeBOdju8SnzPN5vTUJYxYUxLeXpCaVP5i5e0LF8fg7WORF2Wd7wFX/pk0tYZk7s8T+J7VLy0Da6J1+wCT0AtHg==
dependencies:
boolbase "~1.0.0"
parse5@^3.0.1:
version "3.0.3"
resolved "https://registry.yarnpkg.com/parse5/-/parse5-3.0.3.tgz#042f792ffdd36851551cf4e9e066b3874ab45b5c"
integrity sha512-rgO9Zg5LLLkfJF9E6CCmXlSE4UVceloys8JrFqCcHloC3usd/kJCyPDwH2SOlzix2j3xaP9sUX3e8+kvkuleAA==
dependencies:
"@types/node" "*"
readable-stream@^3.1.1:
version "3.6.0"
resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-3.6.0.tgz#337bbda3adc0706bd3e024426a286d4b4b2c9198"
integrity sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA==
dependencies:
inherits "^2.0.3"
string_decoder "^1.1.1"
util-deprecate "^1.0.1"
safe-buffer@~5.2.0:
version "5.2.0"
resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.2.0.tgz#b74daec49b1148f88c64b68d49b1e815c1f2f519"
integrity sha512-fZEwUGbVl7kouZs1jCdMLdt95hdIv0ZeHg6L7qPeciMZhZ+/gdesW4wgTARkrFWEpspjEATAzUGPG8N2jJiwbg==
string_decoder@^1.1.1:
version "1.3.0"
resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.3.0.tgz#42f114594a46cf1a8e30b0a84f56c78c3edac21e"
integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==
dependencies:
safe-buffer "~5.2.0"
util-deprecate@^1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf"
integrity sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=