/
app.js
122 lines (105 loc) · 3.62 KB
/
app.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
const fs = require('fs');
const mongoose = require('mongoose');
const async = require('async');
const Bluebird = require('bluebird');
const rp = require('request-promise');
const errors = require('request-promise/errors');
const cheerio = require('cheerio');
// import environmental variables from .env file
require('dotenv').config();
// Configure mongoose to use Bluebird promises BEFORE connecting, so the
// connection promise returned by mongoose.connect() is itself a Bluebird
// promise (the original assigned mongoose.Promise after connect, too late).
mongoose.Promise = require('bluebird');
mongoose.connect(process.env.DATABASE, {
  // NOTE(review): useMongoClient is a mongoose 4.11.x option; it is
  // deprecated/removed in mongoose 5+ — confirm the installed version.
  useMongoClient: true,
});
mongoose.connection.on('error', (err) => {
  console.error(`${err.message}`);
});
// Import the Listing mongoose model (registered for scrape.saveListing()).
const Listing = require('./models/Listing');
// Shared accumulator: scrape.getUrls() pushes craigslist URLs into this
// array; it is mutated in place, never reassigned, hence const.
const urls = [];
// Scraping / analysis helper modules.
const scrape = require('./lib/scrape');
const analyze = require('./lib/analyze');
// request-promise options for the initial Google Custom Search Engine query.
// Never reassigned, so declared const (was let).
const options = {
  uri: process.env.URL,            // CSE endpoint URL
  qs: {
    key: process.env.KEY,          // CSE API key
    start: 1,                      // 1-based index of the first result
  },
  headers: {
    'User-Agent': 'Request-Promise',
  },
  json: true,                      // automatically JSON-parse the response body
};
rp(options) // Initial Custom Search Engine query
  .then(data => scrape.init(data))
  // Build an array of request-promise option objects, one per results page.
  .then(queryVars => scrape.queryPush(queryVars))
  .then(queries => {
    // Fire every paginated query in parallel and gather all responses.
    const promises = queries.map(query => rp(query));
    return Bluebird.all(promises)
      .then(responses => scrape.checkApi(responses))
      .then(responses => {
        // Side-effect iteration (forEach, not map — the return value was
        // never used): getUrls() appends craigslist ad URLs into `urls`.
        responses.forEach(page => {
          if (page.searchInformation === undefined) {
            console.log('page.searchInformation still undefined');
            return;
          }
          if (!page.error) {
            if (page.searchInformation.totalResults > 0) {
              console.log('results found');
              scrape.getUrls(page.items, urls);
            } else {
              console.log('nothing to see here');
            }
          }
        });
        return urls;
      })
      // Error handling for Bluebird.all(); return the URLs collected so far
      // so the next .then() never receives undefined (the original returned
      // undefined here, which flowed into scrape.cleanUrls()).
      .catch(errors.StatusCodeError, (reason) => {
        console.log('Error: ' + reason);
        return urls;
      });
  })
  .then(urls => scrape.cleanUrls(urls))
  .then(listings => {
    console.log(listings);
    // For each listing URL, open a new request-promise with its own options.
    listings.forEach((listing) => {
      const listingOptions = { // renamed: no longer shadows the outer `options`
        uri: listing,
        simple: false, // do not reject on non-2xx status codes
        // FIX: transform2xxOnly is a request-promise OPTION; the original
        // assigned it as an implicit global inside transform(), so the
        // "only transform 2xx responses" restriction never took effect.
        transform2xxOnly: true,
        transform: (body) => cheerio.load(body),
      };
      rp(listingOptions)
        .then($ => scrape.scrapeCl($))            // scrape listing details
        .then(details => scrape.getImgs(details)) // download images to cwd
        // .then(details => analyze.persona(details))
        .then(details => scrape.saveListing(details)) // persist to MongoDB
        // FIX: terminate this floating promise chain so one bad listing
        // logs instead of raising an unhandled rejection.
        .catch(err => {
          console.log('Error processing ' + listing + ': ' + err.message);
        });
    }); // end of listings.forEach
  })
  .catch(errors.StatusCodeError, (reason) => {
    console.log('Error: ' + reason.statusCode);
    process.exit();
  })
  .catch(err => {
    console.log(err);
    process.exit();
  });