// crawler.h
// @author: Diga Widyaprana
// @matric: A0114171W
#ifndef CRAWLER_H_
#define CRAWLER_H_
// Includes for sockets
#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

#include <sstream>
#include <string>
#include <vector>

// HTML parser
#include <htmlcxx/html/ParserDom.h>

#include "storage.h"
#include "structs.h"
class Crawler {
private:
    /**
     * A struct representing a HTTP response.
     * One can obtain the header and the body of the HTTP response.
     */
    // NOTE: plain C++ struct declaration; the C-style `typedef struct` is
    // unnecessary in C++.
    struct HttpResponse {
        std::string header;
        std::string body;
    };

    /**
     * A reference to the shared storage instance.
     * The crawler does not own the storage; the referenced instance must
     * outlive this object.
     */
    Storage& storage;

public:
    /**
     * @brief Constructs a crawler backed by the given shared storage.
     *
     * @param store The shared storage instance; must outlive this crawler.
     */
    // `explicit` prevents an accidental implicit Storage& -> Crawler
    // conversion through this single-argument constructor.
    explicit Crawler(Storage& store);

    /**
     * @brief Runs the crawler.
     */
    void run();

private:
    /**
     * @brief Connects the specified socket descriptor to the first
     * addressinfo result that can be connected to.
     *
     * @param ai_results The addressinfo result.
     * @param socket_desc The socket file descriptor.
     *
     * @return - true if connection to any one of the addressinfo result is
     * successful
     * - false if cannot connect to any of the addressinfo results
     */
    bool connect(struct addrinfo* ai_results, int* socket_desc);

    /**
     * @brief Constructs a HTTP request header to the specified host and
     * the specified path
     *
     * @param[in] host The host
     * @param[in] path The path
     *
     * @return The HTTP request header
     */
    std::string construct_req_header(const std::string& host, const std::string& path);

    /**
     * @brief Parse a stringstream of HTTP response into a
     * @code{HttpResponse} struct
     *
     * @param ss the stringstream containing HTTP response
     *
     * @return A @code{HttpResponse} struct of the HTTP response string.
     */
    HttpResponse parse_response(std::stringstream& ss);

    /**
     * @brief Extracts the <a> tags of the DOM tree supplied to the method
     *
     * @param dom The DOM tree
     *
     * @return A vector of URLs from the <a> tags in the DOM tree.
     */
    std::vector<std::string> extract_a_tag(tree<htmlcxx::HTML::Node>& dom);

    /**
     * @brief Parse a URL string into a @code{Url} struct
     *
     * @param[in] urlstr The URL string
     * @param[in] current_base_url The base URL the crawler is crawling when
     * obtaining the URL string
     *
     * @return The @code{Url} struct of the URL string
     */
    struct Url parse_url_string(std::string urlstr, const std::string& current_base_url);
};
#endif // CRAWLER_H_