Working Build of Site -> CSV Scraper
This commit is contained in:
parent
0adcb77c12
commit
868477a9ae
|
@ -77,3 +77,4 @@ typings/
|
|||
.fusebox/
|
||||
|
||||
out/
|
||||
data.csv
|
152
index.ts
152
index.ts
|
@ -1,35 +1,147 @@
|
|||
"use strict";
|
||||
const SITE_URL = "https://trendogate.com/";
|
||||
const NUMBER_OF_DAYS = 100;
|
||||
|
||||
import { addDays } from 'date-fns';
|
||||
import cheerio from 'cheerio';
|
||||
import axios from 'axios';
|
||||
import fs from 'fs';
|
||||
|
||||
const BASE_URL = "https://trendogate.com/";
|
||||
const CSV_PATH = "./data.csv";
|
||||
const USA_ID = 23424977;
|
||||
// https://trendogate.com/placebydate/23424977/2015-04-01
|
||||
const NUMBER_OF_DAYS = 100;
|
||||
const WAIT = 5000; // in ms
|
||||
|
||||
// Topic | Date | Position
|
||||
// e.g. https://trendogate.com/placebydate/23424977/2015-04-01
|
||||
// Topic | Position | Date
|
||||
|
||||
// SO: https://stackoverflow.com/questions/4345045/javascript-loop-between-date-ranges/14655646
|
||||
// Build the list of days to scrape: step back one day at a time from the
// current moment until NUMBER_OF_DAYS entries exist (most recent day first).
const now: Date = new Date();
let tmp: Date = now;
let days: Array<Date> = [];

while (days.length < NUMBER_OF_DAYS) {
  tmp = addDays(tmp, -1);
  days.push(tmp);
}
|
||||
|
||||
/**
 * Scrapes the trends page for every entry in `days` and appends the results
 * to the CSV file at CSV_PATH.
 *
 * Requests are issued one at a time, with a pause of WAIT milliseconds after
 * each one. A failed request is reported on stderr and the loop carries on
 * with the next day, so one bad page does not abort the whole run.
 *
 * @returns a promise that settles once every day has been processed and the
 *          CSV stream has been closed.
 */
async function scrapeData(): Promise<void> {
  const csv = new CSVManager(CSV_PATH);
  csv.createWriteStream();

  try {
    for (let i = 0; i < days.length; i++) {
      const day = days[i];
      const url = getNewDateUrl(day);

      try {
        // Await the response instead of chaining .then so a rejection is
        // caught here rather than surfacing as an unhandled rejection.
        const res = await axios.get(url);

        if (res.status === 200) {
          const trends: Array<Trend> = handleHttpResponse(res.data, day);
          process.stdout.write(`URL: ${url}\n`);
          writeToCsv(trends, csv);
        }
        else process.stderr.write(`Server returned with a Status of: ${res.status}\n`);
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        process.stderr.write(`Request to ${url} failed: ${reason}\n`);
      }

      // Throttle so the site is not hit with back-to-back requests.
      await sleep(WAIT);
    }
  } finally {
    // Always release the file handle, even if something above throws.
    csv.closeWriteStream();
  }
}
|
||||
|
||||
/**
 * Extracts the ranked trends from one trends page.
 *
 * Each child of `div.panel > ul.list-group` is treated as one trend; its
 * 1-based position in the list becomes the ranking and the text of its
 * first grandchild node becomes the trend name.
 *
 * @param html raw HTML of the page
 * @param day  the date the page describes; stored on every Trend
 * @returns the trends found, in list order. Items whose markup does not
 *          contain the expected text node are skipped and reported on stderr.
 */
function handleHttpResponse(html: string, day: Date): Array<Trend> {
  const trends: Array<Trend> = [];
  const $ = cheerio.load(html);

  $('div.panel > ul.list-group').children().each((i, child) => {
    // Walk down defensively: an item with unexpected markup must not throw
    // and abort the whole page.
    const inner = child.firstChild;
    const textNode = inner ? inner.firstChild : null;
    const term = textNode ? textNode.data : undefined;

    if (typeof term !== "string") {
      process.stderr.write(`Skipping list item ${i + 1}: no trend text found\n`);
      return;
    }

    trends.push(new Trend(i + 1, term, day));
  });

  return trends;
}
|
||||
|
||||
// SO: https://stackoverflow.com/questions/14249506/how-can-i-wait-in-node-js-javascript-l-need-to-pause-for-a-period-of-time
|
||||
/**
 * Returns a promise that resolves (with no value) once `ms` milliseconds
 * have elapsed. Intended to be awaited to pause an async function.
 */
function sleep(ms: number) {
  return new Promise(wake => setTimeout(wake, ms));
}
|
||||
|
||||
/**
 * Builds the "place by date" URL for USA_ID on the given day, in the form
 * `<BASE_URL>/placebydate/<USA_ID>/YYYY-MM-DD`.
 *
 * The year, month and day are taken from the date's local-time fields.
 *
 * @param date the day to request
 * @returns the absolute URL for that day's page
 */
function getNewDateUrl(date: Date): string {
  // Zero-pad a one-digit number to two digits.
  const pad = (value: number): string => (value < 10 ? `0${value}` : `${value}`);

  const year = date.getFullYear();
  // getMonth() is zero-based (January === 0), so shift to 1-12.
  const month = date.getMonth() + 1;
  // getDate() is the day of the month; getDay() would be the weekday (0-6).
  const day = date.getDate();

  // Drop any trailing slash on BASE_URL so the path does not start with "//".
  const base = BASE_URL.replace(/\/+$/, "");

  return `${base}/placebydate/${USA_ID}/${year}-${pad(month)}-${pad(day)}`;
}
|
||||
|
||||
/**
 * Writes each trend to the CSV as one row, in array order, using the
 * trend's own `toCsv()` serialisation.
 */
function writeToCsv(trends: Array<Trend>, csv: CSVManager) {
  for (const trend of trends) {
    const row = trend.toCsv();
    csv.write(row);
  }
}
|
||||
|
||||
/**
 * Owns the write stream for a single CSV file.
 *
 * Lifecycle: construct with a path, call `createWriteStream()` to start a
 * fresh (truncated) file, `write()` rows, then `closeWriteStream()`.
 */
class CSVManager {
  // null whenever no stream is currently open.
  private stream: fs.WriteStream | null = null;
  private path: string;

  /** @param path location of the CSV file this manager writes to */
  constructor(path: string) {
    this.path = path;
  }

  /**
   * Appends `row` verbatim to the open stream. If no stream is open the row
   * is dropped and a message is written to stderr.
   *
   * A write error reported by the stream is rethrown from the stream's
   * callback, i.e. outside the caller's stack.
   */
  public write(row: string) {
    if (this.stream !== null) {
      this.stream.write(row, err => { if (err) throw err; });
    } else process.stderr.write(`Unable to write to "${this.path}". Stream does not exist.\n`);
  }

  /**
   * Opens the file for writing, creating it or truncating existing content.
   * Any stream opened by an earlier call is ended first so it is not leaked.
   */
  public createWriteStream() {
    if (this.stream !== null) this.stream.end();
    // "w" creates or truncates in one step, so no separate exists/open/close
    // round-trip is needed to empty an existing file.
    this.stream = fs.createWriteStream(this.path, { flags: "w" });
  }

  /** Ends the stream if one is open; does nothing otherwise. */
  public closeWriteStream() {
    if (this.stream === null) return;
    this.stream.end();
    this.stream = null;
  }

  /** @returns the path passed to the constructor */
  public getPath() {
    return this.path;
  }
}
|
||||
|
||||
|
||||
/**
 * One trending topic: its name, its ranking on the page, and the date of
 * the page it was read from.
 */
class Trend {
  private readonly name: string;
  private readonly date: Date;
  private readonly ranking: number;

  /**
   * @param ranking position of the trend in the list
   * @param name    the trend text
   * @param date    the day the trend was listed for
   */
  constructor(ranking: number, name: string, date: Date) {
    this.name = name;
    this.date = date;
    this.ranking = ranking;
  }

  public getName() {
    return this.name;
  }

  public getDate() {
    return this.date;
  }

  public getRanking() {
    return this.ranking;
  }

  /**
   * Serialises the trend as one CSV row: `name, ranking, date` followed by a
   * newline. The date is rendered with its default string conversion.
   *
   * The name is quoted when it contains a comma, double quote or line break,
   * so such names cannot shift or split the columns.
   */
  public toCsv() {
    return `${Trend.escapeCsvField(this.name)}, ${this.ranking}, ${this.date}\n`;
  }

  /** Quotes a field (doubling embedded quotes) only when it needs it. */
  private static escapeCsvField(field: string): string {
    if (!/[",\r\n]/.test(field)) return field;
    return `"${field.replace(/"/g, '""')}"`;
  }
}
|
||||
|
||||
|
||||
// Entry point: start the scrape and report any error that escapes it,
// instead of leaving the returned promise's rejection unhandled.
scrapeData().catch((err: unknown) => {
  const reason = err instanceof Error ? err.message : String(err);
  process.stderr.write(`Scrape failed: ${reason}\n`);
  process.exitCode = 1;
});
|
13
package.json
13
package.json
|
@ -5,6 +5,17 @@
|
|||
"main": "index.js",
|
||||
"repository": "ssh://gitea@git.paoda.moe:31059/paoda/trendogate_scraper.git",
|
||||
"author": "paoda <musukarekai@gmail.com>",
|
||||
"scripts": {
|
||||
"start": "node ./out/index.js"
|
||||
},
|
||||
"license": "MIT",
|
||||
"private": true
|
||||
"private": true,
|
||||
"dependencies": {
|
||||
"axios": "^0.19.2",
|
||||
"cheerio": "^1.0.0-rc.3",
|
||||
"date-fns": "^2.10.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/cheerio": "^0.22.16"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,7 +3,8 @@
|
|||
"target": "ES5",
|
||||
"module": "CommonJS",
|
||||
"outDir": "out",
|
||||
"sourceMap": true
|
||||
"sourceMap": true,
|
||||
"esModuleInterop": true
|
||||
},
|
||||
"compileOnSave": true
|
||||
}
|
|
@ -0,0 +1,199 @@
|
|||
# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
|
||||
# yarn lockfile v1
|
||||
|
||||
|
||||
"@types/cheerio@^0.22.16":
|
||||
version "0.22.16"
|
||||
resolved "https://registry.yarnpkg.com/@types/cheerio/-/cheerio-0.22.16.tgz#c748a97b8a6f781b04bbda4a552e11b35bcc77e4"
|
||||
integrity sha512-bSbnU/D4yzFdzLpp3+rcDj0aQQMIRUBNJU7azPxdqMpnexjUSvGJyDuOBQBHeOZh1mMKgsJm6Dy+LLh80Ew4tQ==
|
||||
dependencies:
|
||||
"@types/node" "*"
|
||||
|
||||
"@types/node@*":
|
||||
version "13.7.7"
|
||||
resolved "https://registry.yarnpkg.com/@types/node/-/node-13.7.7.tgz#1628e6461ba8cc9b53196dfeaeec7b07fa6eea99"
|
||||
integrity sha512-Uo4chgKbnPNlxQwoFmYIwctkQVkMMmsAoGGU4JKwLuvBefF0pCq4FybNSnfkfRCpC7ZW7kttcC/TrRtAJsvGtg==
|
||||
|
||||
axios@^0.19.2:
|
||||
version "0.19.2"
|
||||
resolved "https://registry.yarnpkg.com/axios/-/axios-0.19.2.tgz#3ea36c5d8818d0d5f8a8a97a6d36b86cdc00cb27"
|
||||
integrity sha512-fjgm5MvRHLhx+osE2xoekY70AhARk3a6hkN+3Io1jc00jtquGvxYlKlsFUhmUET0V5te6CcZI7lcv2Ym61mjHA==
|
||||
dependencies:
|
||||
follow-redirects "1.5.10"
|
||||
|
||||
boolbase@~1.0.0:
|
||||
version "1.0.0"
|
||||
resolved "https://registry.yarnpkg.com/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e"
|
||||
integrity sha1-aN/1++YMUes3cl6p4+0xDcwed24=
|
||||
|
||||
cheerio@^1.0.0-rc.3:
|
||||
version "1.0.0-rc.3"
|
||||
resolved "https://registry.yarnpkg.com/cheerio/-/cheerio-1.0.0-rc.3.tgz#094636d425b2e9c0f4eb91a46c05630c9a1a8bf6"
|
||||
integrity sha512-0td5ijfUPuubwLUu0OBoe98gZj8C/AA+RW3v67GPlGOrvxWjZmBXiBCRU+I8VEiNyJzjth40POfHiz2RB3gImA==
|
||||
dependencies:
|
||||
css-select "~1.2.0"
|
||||
dom-serializer "~0.1.1"
|
||||
entities "~1.1.1"
|
||||
htmlparser2 "^3.9.1"
|
||||
lodash "^4.15.0"
|
||||
parse5 "^3.0.1"
|
||||
|
||||
css-select@~1.2.0:
|
||||
version "1.2.0"
|
||||
resolved "https://registry.yarnpkg.com/css-select/-/css-select-1.2.0.tgz#2b3a110539c5355f1cd8d314623e870b121ec858"
|
||||
integrity sha1-KzoRBTnFNV8c2NMUYj6HCxIeyFg=
|
||||
dependencies:
|
||||
boolbase "~1.0.0"
|
||||
css-what "2.1"
|
||||
domutils "1.5.1"
|
||||
nth-check "~1.0.1"
|
||||
|
||||
css-what@2.1:
|
||||
version "2.1.3"
|
||||
resolved "https://registry.yarnpkg.com/css-what/-/css-what-2.1.3.tgz#a6d7604573365fe74686c3f311c56513d88285f2"
|
||||
integrity sha512-a+EPoD+uZiNfh+5fxw2nO9QwFa6nJe2Or35fGY6Ipw1R3R4AGz1d1TEZrCegvw2YTmZ0jXirGYlzxxpYSHwpEg==
|
||||
|
||||
date-fns@^2.10.0:
|
||||
version "2.10.0"
|
||||
resolved "https://registry.yarnpkg.com/date-fns/-/date-fns-2.10.0.tgz#abd10604d8bafb0bcbd2ba2e9b0563b922ae4b6b"
|
||||
integrity sha512-EhfEKevYGWhWlZbNeplfhIU/+N+x0iCIx7VzKlXma2EdQyznVlZhCptXUY+BegNpPW2kjdx15Rvq503YcXXrcA==
|
||||
|
||||
debug@=3.1.0:
|
||||
version "3.1.0"
|
||||
resolved "https://registry.yarnpkg.com/debug/-/debug-3.1.0.tgz#5bb5a0672628b64149566ba16819e61518c67261"
|
||||
integrity sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==
|
||||
dependencies:
|
||||
ms "2.0.0"
|
||||
|
||||
dom-serializer@0:
|
||||
version "0.2.2"
|
||||
resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-0.2.2.tgz#1afb81f533717175d478655debc5e332d9f9bb51"
|
||||
integrity sha512-2/xPb3ORsQ42nHYiSunXkDjPLBaEj/xTwUO4B7XCZQTRk7EBtTOPaygh10YAAh2OI1Qrp6NWfpAhzswj0ydt9g==
|
||||
dependencies:
|
||||
domelementtype "^2.0.1"
|
||||
entities "^2.0.0"
|
||||
|
||||
dom-serializer@~0.1.1:
|
||||
version "0.1.1"
|
||||
resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-0.1.1.tgz#1ec4059e284babed36eec2941d4a970a189ce7c0"
|
||||
integrity sha512-l0IU0pPzLWSHBcieZbpOKgkIn3ts3vAh7ZuFyXNwJxJXk/c4Gwj9xaTJwIDVQCXawWD0qb3IzMGH5rglQaO0XA==
|
||||
dependencies:
|
||||
domelementtype "^1.3.0"
|
||||
entities "^1.1.1"
|
||||
|
||||
domelementtype@1, domelementtype@^1.3.0, domelementtype@^1.3.1:
|
||||
version "1.3.1"
|
||||
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-1.3.1.tgz#d048c44b37b0d10a7f2a3d5fee3f4333d790481f"
|
||||
integrity sha512-BSKB+TSpMpFI/HOxCNr1O8aMOTZ8hT3pM3GQ0w/mWRmkhEDSFJkkyzz4XQsBV44BChwGkrDfMyjVD0eA2aFV3w==
|
||||
|
||||
domelementtype@^2.0.1:
|
||||
version "2.0.1"
|
||||
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.0.1.tgz#1f8bdfe91f5a78063274e803b4bdcedf6e94f94d"
|
||||
integrity sha512-5HOHUDsYZWV8FGWN0Njbr/Rn7f/eWSQi1v7+HsUVwXgn8nWWlL64zKDkS0n8ZmQ3mlWOMuXOnR+7Nx/5tMO5AQ==
|
||||
|
||||
domhandler@^2.3.0:
|
||||
version "2.4.2"
|
||||
resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-2.4.2.tgz#8805097e933d65e85546f726d60f5eb88b44f803"
|
||||
integrity sha512-JiK04h0Ht5u/80fdLMCEmV4zkNh2BcoMFBmZ/91WtYZ8qVXSKjiw7fXMgFPnHcSZgOo3XdinHvmnDUeMf5R4wA==
|
||||
dependencies:
|
||||
domelementtype "1"
|
||||
|
||||
domutils@1.5.1:
|
||||
version "1.5.1"
|
||||
resolved "https://registry.yarnpkg.com/domutils/-/domutils-1.5.1.tgz#dcd8488a26f563d61079e48c9f7b7e32373682cf"
|
||||
integrity sha1-3NhIiib1Y9YQeeSMn3t+Mjc2gs8=
|
||||
dependencies:
|
||||
dom-serializer "0"
|
||||
domelementtype "1"
|
||||
|
||||
domutils@^1.5.1:
|
||||
version "1.7.0"
|
||||
resolved "https://registry.yarnpkg.com/domutils/-/domutils-1.7.0.tgz#56ea341e834e06e6748af7a1cb25da67ea9f8c2a"
|
||||
integrity sha512-Lgd2XcJ/NjEw+7tFvfKxOzCYKZsdct5lczQ2ZaQY8Djz7pfAD3Gbp8ySJWtreII/vDlMVmxwa6pHmdxIYgttDg==
|
||||
dependencies:
|
||||
dom-serializer "0"
|
||||
domelementtype "1"
|
||||
|
||||
entities@^1.1.1, entities@~1.1.1:
|
||||
version "1.1.2"
|
||||
resolved "https://registry.yarnpkg.com/entities/-/entities-1.1.2.tgz#bdfa735299664dfafd34529ed4f8522a275fea56"
|
||||
integrity sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w==
|
||||
|
||||
entities@^2.0.0:
|
||||
version "2.0.0"
|
||||
resolved "https://registry.yarnpkg.com/entities/-/entities-2.0.0.tgz#68d6084cab1b079767540d80e56a39b423e4abf4"
|
||||
integrity sha512-D9f7V0JSRwIxlRI2mjMqufDrRDnx8p+eEOz7aUM9SuvF8gsBzra0/6tbjl1m8eQHrZlYj6PxqE00hZ1SAIKPLw==
|
||||
|
||||
follow-redirects@1.5.10:
|
||||
version "1.5.10"
|
||||
resolved "https://registry.yarnpkg.com/follow-redirects/-/follow-redirects-1.5.10.tgz#7b7a9f9aea2fdff36786a94ff643ed07f4ff5e2a"
|
||||
integrity sha512-0V5l4Cizzvqt5D44aTXbFZz+FtyXV1vrDN6qrelxtfYQKW0KO0W2T/hkE8xvGa/540LkZlkaUjO4ailYTFtHVQ==
|
||||
dependencies:
|
||||
debug "=3.1.0"
|
||||
|
||||
htmlparser2@^3.9.1:
|
||||
version "3.10.1"
|
||||
resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-3.10.1.tgz#bd679dc3f59897b6a34bb10749c855bb53a9392f"
|
||||
integrity sha512-IgieNijUMbkDovyoKObU1DUhm1iwNYE/fuifEoEHfd1oZKZDaONBSkal7Y01shxsM49R4XaMdGez3WnF9UfiCQ==
|
||||
dependencies:
|
||||
domelementtype "^1.3.1"
|
||||
domhandler "^2.3.0"
|
||||
domutils "^1.5.1"
|
||||
entities "^1.1.1"
|
||||
inherits "^2.0.1"
|
||||
readable-stream "^3.1.1"
|
||||
|
||||
inherits@^2.0.1, inherits@^2.0.3:
|
||||
version "2.0.4"
|
||||
resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c"
|
||||
integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==
|
||||
|
||||
lodash@^4.15.0:
|
||||
version "4.17.15"
|
||||
resolved "https://registry.yarnpkg.com/lodash/-/lodash-4.17.15.tgz#b447f6670a0455bbfeedd11392eff330ea097548"
|
||||
integrity sha512-8xOcRHvCjnocdS5cpwXQXVzmmh5e5+saE2QGoeQmbKmRS6J3VQppPOIt0MnmE+4xlZoumy0GPG0D0MVIQbNA1A==
|
||||
|
||||
ms@2.0.0:
|
||||
version "2.0.0"
|
||||
resolved "https://registry.yarnpkg.com/ms/-/ms-2.0.0.tgz#5608aeadfc00be6c2901df5f9861788de0d597c8"
|
||||
integrity sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=
|
||||
|
||||
nth-check@~1.0.1:
|
||||
version "1.0.2"
|
||||
resolved "https://registry.yarnpkg.com/nth-check/-/nth-check-1.0.2.tgz#b2bd295c37e3dd58a3bf0700376663ba4d9cf05c"
|
||||
integrity sha512-WeBOdju8SnzPN5vTUJYxYUxLeXpCaVP5i5e0LF8fg7WORF2Wd7wFX/pk0tYZk7s8T+J7VLy0Da6J1+wCT0AtHg==
|
||||
dependencies:
|
||||
boolbase "~1.0.0"
|
||||
|
||||
parse5@^3.0.1:
|
||||
version "3.0.3"
|
||||
resolved "https://registry.yarnpkg.com/parse5/-/parse5-3.0.3.tgz#042f792ffdd36851551cf4e9e066b3874ab45b5c"
|
||||
integrity sha512-rgO9Zg5LLLkfJF9E6CCmXlSE4UVceloys8JrFqCcHloC3usd/kJCyPDwH2SOlzix2j3xaP9sUX3e8+kvkuleAA==
|
||||
dependencies:
|
||||
"@types/node" "*"
|
||||
|
||||
readable-stream@^3.1.1:
|
||||
version "3.6.0"
|
||||
resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-3.6.0.tgz#337bbda3adc0706bd3e024426a286d4b4b2c9198"
|
||||
integrity sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA==
|
||||
dependencies:
|
||||
inherits "^2.0.3"
|
||||
string_decoder "^1.1.1"
|
||||
util-deprecate "^1.0.1"
|
||||
|
||||
safe-buffer@~5.2.0:
|
||||
version "5.2.0"
|
||||
resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.2.0.tgz#b74daec49b1148f88c64b68d49b1e815c1f2f519"
|
||||
integrity sha512-fZEwUGbVl7kouZs1jCdMLdt95hdIv0ZeHg6L7qPeciMZhZ+/gdesW4wgTARkrFWEpspjEATAzUGPG8N2jJiwbg==
|
||||
|
||||
string_decoder@^1.1.1:
|
||||
version "1.3.0"
|
||||
resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.3.0.tgz#42f114594a46cf1a8e30b0a84f56c78c3edac21e"
|
||||
integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==
|
||||
dependencies:
|
||||
safe-buffer "~5.2.0"
|
||||
|
||||
util-deprecate@^1.0.1:
|
||||
version "1.0.2"
|
||||
resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf"
|
||||
integrity sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=
|
Reference in New Issue