introduce passive crawling (#781)
* introduce passive crawling

* remove example

* add alienvault

* fix release-test linux workflow

* minor

* omit empty

* add passive ref

* validate CLI flags: can't be used with headless

* duplicate URL check

* format CLI output

* refactor extractor pkg

* add response

* fix go.mod

* minor changes

* misc update

---------

Co-authored-by: mzack <marco.rivoli.nvh@gmail.com>
Co-authored-by: sandeep <8293321+ehsandeep@users.noreply.github.com>
3 people committed Mar 20, 2024
1 parent 769074a commit 50865cf
Showing 20 changed files with 657 additions and 15 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/release-test.yml
@@ -46,7 +46,7 @@ jobs:

# todo: musl compatible?
- name: Install Dependences
run: sudo apt install gcc-aarch64-linux-gnu
run: sudo apt update && sudo apt install gcc-aarch64-linux-gnu

- name: release test
uses: goreleaser/goreleaser-action@v4
7 changes: 6 additions & 1 deletion README.md
@@ -29,7 +29,8 @@
![image](https://user-images.githubusercontent.com/8293321/199371558-daba03b6-bf9c-4883-8506-76497c6c3a44.png)

- Fast And fully configurable web crawling
- **Standard** and **Headless** mode support
- **Standard** and **Headless** mode
- **Active** and **Passive** mode
- **JavaScript** parsing / crawling
- Customizable **automatic form filling**
- **Scope control** - Preconfigured field / Regex
@@ -155,6 +156,10 @@ HEADLESS:
-cwu, -chrome-ws-url string use chrome browser instance launched elsewhere with the debugger listening at this URL
-xhr, -xhr-extraction extract xhr request url,method in jsonl output

PASSIVE:
-ps, -passive enable passive sources to discover target endpoints
-pss, -passive-source string[] passive source to use for url discovery (waybackarchive,commoncrawl,alienvault)

SCOPE:
-cs, -crawl-scope string[] in scope url regex to be followed by crawler
-cos, -crawl-out-scope string[] out of scope url regex to be excluded by crawler
4 changes: 4 additions & 0 deletions cmd/katana/main.go
@@ -126,6 +126,10 @@ pipelines offering both headless and non-headless crawling.`)
flagSet.StringVarP(&options.ChromeWSUrl, "chrome-ws-url", "cwu", "", "use chrome browser instance launched elsewhere with the debugger listening at this URL"),
flagSet.BoolVarP(&options.XhrExtraction, "xhr-extraction", "xhr", false, "extract xhr request url,method in jsonl output"),
)
flagSet.CreateGroup("passive", "Passive",
flagSet.BoolVarP(&options.Passive, "passive", "ps", false, "enable passive sources to discover target endpoints"),
flagSet.StringSliceVarP(&options.PassiveSource, "passive-source", "pss", nil, "passive source to use for url discovery (waybackarchive,commoncrawl,alienvault)", goflags.NormalizedStringSliceOptions),
)

flagSet.CreateGroup("scope", "Scope",
flagSet.StringSliceVarP(&options.Scope, "crawl-scope", "cs", nil, "in scope url regex to be followed by crawler", goflags.FileCommaSeparatedStringSliceOptions),
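For reference, the two new flags can be exercised in isolation with goflags. The sketch below reuses the exact registration calls from the diff above; the surrounding `main` and the `Parse` call are illustrative assumptions, not katana code.

```go
package main

import (
	"fmt"

	"github.com/projectdiscovery/goflags"
)

func main() {
	var passive bool
	var passiveSource goflags.StringSlice

	flagSet := goflags.NewFlagSet()
	flagSet.CreateGroup("passive", "Passive",
		flagSet.BoolVarP(&passive, "passive", "ps", false, "enable passive sources to discover target endpoints"),
		flagSet.StringSliceVarP(&passiveSource, "passive-source", "pss", nil, "passive source to use for url discovery (waybackarchive,commoncrawl,alienvault)", goflags.NormalizedStringSliceOptions),
	)
	if err := flagSet.Parse(); err != nil {
		panic(err)
	}

	// e.g. `-ps -pss waybackarchive,alienvault` prints: true [waybackarchive alienvault]
	fmt.Println(passive, passiveSource)
}
```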
4 changes: 4 additions & 0 deletions go.mod
@@ -19,6 +19,7 @@ require (
github.com/projectdiscovery/mapcidr v1.1.16
github.com/projectdiscovery/ratelimit v0.0.33
github.com/projectdiscovery/retryablehttp-go v1.0.52
github.com/projectdiscovery/useragent v0.0.41
github.com/projectdiscovery/utils v0.0.83
github.com/projectdiscovery/wappalyzergo v0.0.113
github.com/remeh/sizedwaitgroup v1.0.0
@@ -55,6 +56,7 @@ require (
github.com/kataras/jwt v0.1.8 // indirect
github.com/klauspost/compress v1.16.7 // indirect
github.com/klauspost/pgzip v1.2.5 // indirect
github.com/kr/pretty v0.3.1 // indirect
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.19 // indirect
@@ -69,9 +71,11 @@
github.com/projectdiscovery/blackrock v0.0.1 // indirect
github.com/projectdiscovery/gostruct v0.0.2 // indirect
github.com/projectdiscovery/machineid v0.0.0-20240226150047-2e2c51e35983 // indirect
github.com/projectdiscovery/stringsutil v0.0.2 // indirect
github.com/quic-go/quic-go v0.37.7 // indirect
github.com/refraction-networking/utls v1.5.4 // indirect
github.com/rivo/uniseg v0.4.4 // indirect
github.com/rogpeppe/go-internal v1.12.0 // indirect
github.com/sashabaranov/go-openai v1.14.2 // indirect
github.com/shoenig/go-m1cpu v0.1.6 // indirect
github.com/smacker/go-tree-sitter v0.0.0-20230720070738-0d0a9f78d8f8 // indirect
14 changes: 12 additions & 2 deletions go.sum
@@ -45,6 +45,7 @@ github.com/cloudflare/circl v1.3.7 h1:qlCDlTPz2n9fu58M0Nh1J/JzcFpfgkFHHX3O35r5vc
github.com/cloudflare/circl v1.3.7/go.mod h1:sRTcRWXGLrKw6yIGJ+l7amYJFfAXbZG0kBSc8r4zxgA=
github.com/cnf/structhash v0.0.0-20201127153200-e1b16c1ebc08 h1:ox2F0PSMlrAAiAdknSRMDrAr8mfxPCfSZolH+/qQnyQ=
github.com/cnf/structhash v0.0.0-20201127153200-e1b16c1ebc08/go.mod h1:pCxVEbcm3AMg7ejXyorUXi6HQCzOIBf7zEDVPtw0/U4=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -126,8 +127,8 @@ github.com/klauspost/pgzip v1.2.5 h1:qnWYvvKqedOF2ulHpMG72XQol4ILEJ8k2wwRl/Km8oE
github.com/klauspost/pgzip v1.2.5/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
@@ -192,6 +193,7 @@ github.com/onsi/gomega v1.27.6/go.mod h1:PIQNjfQwkP3aQAH7lf7j87O/5FiNr+ZR8+ipb+q
github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWErDVzsI23lYNej1Htcns9BCg93Dk0bBINWk=
github.com/pierrec/lz4/v4 v4.1.2 h1:qvY3YFXRQE/XB8MlLzJH7mSzBs74eA2gg52YTk6jUPM=
github.com/pierrec/lz4/v4 v4.1.2/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
@@ -226,6 +228,10 @@ github.com/projectdiscovery/retryabledns v1.0.58 h1:ut1FSB9+GZ6zQIlKJFLqIz2RZs81
github.com/projectdiscovery/retryabledns v1.0.58/go.mod h1:RobmKoNBgngAVE4H9REQtaLP1pa4TCyypHy1MWHT1mY=
github.com/projectdiscovery/retryablehttp-go v1.0.52 h1:E1EXok2oXmX1pwCHMyMKkdbiyp0IUxd5bQ7ZbT8AK+o=
github.com/projectdiscovery/retryablehttp-go v1.0.52/go.mod h1:DITjQ0spJHSL81ALR6BEr+yMw/Nxhw0qSdjwF9mGhjI=
github.com/projectdiscovery/stringsutil v0.0.2 h1:uzmw3IVLJSMW1kEg8eCStG/cGbYYZAja8BH3LqqJXMA=
github.com/projectdiscovery/stringsutil v0.0.2/go.mod h1:EJ3w6bC5fBYjVou6ryzodQq37D5c6qbAYQpGmAy+DC0=
github.com/projectdiscovery/useragent v0.0.41 h1:GWHPIArnz6/rKpfbqlP484QmHiOFERH0tewvmAh1MHE=
github.com/projectdiscovery/useragent v0.0.41/go.mod h1:oXjattkrFK9Y/8c+9/6aBkAA307L/NWQrs28uJaE9ow=
github.com/projectdiscovery/utils v0.0.83 h1:r7OBAuEwe4lyEwTITbCEZytoxvjk/s0Xra2NT+K4fm4=
github.com/projectdiscovery/utils v0.0.83/go.mod h1:2XFoaGD5NPUp6liTRHC2tGmMQnIhQSXscpP3zfAG7iE=
github.com/projectdiscovery/wappalyzergo v0.0.113 h1:aoGOY3iGXX6U1RC2TAVEd/s65BESNYYIqpthZvcsZIk=
@@ -240,8 +246,12 @@ github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJ
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis=
github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
github.com/rs/xid v1.5.0 h1:mKX4bl4iPYJtEIxp6CYiUuLQ/8DYMoz0PUdtGgMFRVc=
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/sashabaranov/go-openai v1.14.2 h1:5DPTtR9JBjKPJS008/A409I5ntFhUPPGCmaAihcPRyo=
5 changes: 5 additions & 0 deletions internal/runner/options.go
@@ -26,6 +26,11 @@ func validateOptions(options *types.Options) error {
if len(options.URLs) == 0 && !fileutil.HasStdin() {
return errorutil.New("no inputs specified for crawler")
}

if options.Headless && options.Passive {
return errorutil.New("headless mode (-headless) and passive mode (-passive) cannot be used together")
}

if (options.HeadlessOptionalArguments != nil || options.HeadlessNoSandbox || options.SystemChromePath != "") && !options.Headless {
return errorutil.New("headless mode (-hl) is required if -ho, -nos or -scp are set")
}
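The check added above rejects `-headless` together with `-passive` before any crawling starts. Below is a minimal standalone sketch of that mutual-exclusion pattern; the `options` struct and `validate` function are illustrative stand-ins, not katana's actual `types.Options` or `validateOptions`.

```go
package main

import (
	"errors"
	"fmt"
)

// options mirrors only the two fields relevant to the new check; the real
// struct in pkg/types carries many more crawler settings.
type options struct {
	Headless bool
	Passive  bool
}

func validate(o *options) error {
	// Passive mode replays URLs from archive sources and never drives a
	// browser, so combining it with headless mode is rejected up front.
	if o.Headless && o.Passive {
		return errors.New("headless mode (-headless) and passive mode (-passive) cannot be used together")
	}
	return nil
}

func main() {
	fmt.Println(validate(&options{Headless: true, Passive: true})) // error
	fmt.Println(validate(&options{Passive: true}))                 // <nil>
}
```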
3 changes: 3 additions & 0 deletions internal/runner/runner.go
@@ -9,6 +9,7 @@ import (
"github.com/projectdiscovery/katana/pkg/engine"
"github.com/projectdiscovery/katana/pkg/engine/hybrid"
"github.com/projectdiscovery/katana/pkg/engine/parser"
"github.com/projectdiscovery/katana/pkg/engine/passive"
"github.com/projectdiscovery/katana/pkg/engine/standard"
"github.com/projectdiscovery/katana/pkg/types"
"github.com/projectdiscovery/mapcidr"
@@ -98,6 +99,8 @@ func New(options *types.Options) (*Runner, error) {
switch {
case options.Headless:
crawler, err = hybrid.New(crawlerOptions)
case options.Passive:
crawler, err = passive.New(crawlerOptions)
default:
crawler, err = standard.New(crawlerOptions)
}
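With this change the runner selects one of three engines (standard, hybrid, passive) from the same constructor pattern. The snippet below is a hypothetical sketch of the contract they share from the runner's point of view; the real interface lives in pkg/engine and may differ.

```go
package runner

// Crawler is a hypothetical sketch of the contract the runner relies on when
// choosing between standard.New, hybrid.New and passive.New; names and
// signatures here are assumptions, not katana's actual definition.
type Crawler interface {
	// Crawl discovers endpoints for the given root target.
	Crawl(rootURL string) error
	// Close releases any resources held by the engine.
	Close() error
}
```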
15 changes: 8 additions & 7 deletions pkg/engine/common/base.go
@@ -72,7 +72,7 @@ func (s *Shared) Enqueue(queue *queue.Queue, navigationRequests ...*navigation.R
// if the user requested anyway out of scope items
// they are sent to output without visiting
if s.Options.Options.DisplayOutScope {
s.Output(nr, nil, ErrOutOfScope)
s.Output(nr, nil, nil, ErrOutOfScope)
}
continue
}
@@ -95,17 +95,18 @@ func (s *Shared) ValidateScope(URL string, root string) bool {
return err == nil && scopeValidated
}

func (s *Shared) Output(navigationRequest *navigation.Request, navigationResponse *navigation.Response, err error) {
func (s *Shared) Output(navigationRequest *navigation.Request, navigationResponse *navigation.Response, passiveReference *navigation.PassiveReference, err error) {
var errData string
if err != nil {
errData = err.Error()
}
// Write the found result to output
result := &output.Result{
Timestamp: time.Now(),
Request: navigationRequest,
Response: navigationResponse,
Error: errData,
Timestamp: time.Now(),
Request: navigationRequest,
Response: navigationResponse,
PassiveReference: passiveReference,
Error: errData,
}

outputErr := s.Options.OutputWriter.Write(result)
@@ -223,7 +224,7 @@ func (s *Shared) Do(crawlSession *CrawlSession, doRequest DoRequestFunc) error {

resp, err := doRequest(crawlSession, req)

s.Output(req, resp, err)
s.Output(req, resp, nil, err)

if err != nil {
gologger.Warning().Msgf("Could not request seed URL %s: %s\n", req.URL, err)
3 changes: 3 additions & 0 deletions pkg/engine/passive/doc.go
@@ -0,0 +1,3 @@
// Package passive implements the functionality for a non-headless crawler.
// It uses net/http for making requests and goquery for scraping web page HTML.
package passive
124 changes: 124 additions & 0 deletions pkg/engine/passive/httpclient/httpclient.go
@@ -0,0 +1,124 @@
package httpclient

import (
"bytes"
"context"
"crypto/tls"
"fmt"
"io"
"net"
"net/http"
"net/url"
"time"

"github.com/projectdiscovery/gologger"
"github.com/projectdiscovery/useragent"
)

type HttpClient struct {
Client *http.Client
}

type BasicAuth struct {
Username string
Password string
}

func NewHttpClient(timeout int) *HttpClient {
Transport := &http.Transport{
MaxIdleConns: 100,
MaxIdleConnsPerHost: 100,
TLSClientConfig: &tls.Config{
InsecureSkipVerify: true,
},
Dial: (&net.Dialer{
Timeout: time.Duration(timeout) * time.Second,
}).Dial,
}

client := &http.Client{
Transport: Transport,
Timeout: time.Duration(timeout) * time.Second,
}

httpClient := &HttpClient{Client: client}

return httpClient
}

func (hc *HttpClient) Get(ctx context.Context, getURL, cookies string, headers map[string]string) (*http.Response, error) {
return hc.HTTPRequest(ctx, http.MethodGet, getURL, cookies, headers, nil, BasicAuth{})
}

func (hc *HttpClient) SimpleGet(ctx context.Context, getURL string) (*http.Response, error) {
return hc.HTTPRequest(ctx, http.MethodGet, getURL, "", map[string]string{}, nil, BasicAuth{})
}

func (hc *HttpClient) Post(ctx context.Context, postURL, cookies string, headers map[string]string, body io.Reader) (*http.Response, error) {
return hc.HTTPRequest(ctx, http.MethodPost, postURL, cookies, headers, body, BasicAuth{})
}

func (hc *HttpClient) SimplePost(ctx context.Context, postURL, contentType string, body io.Reader) (*http.Response, error) {
return hc.HTTPRequest(ctx, http.MethodPost, postURL, "", map[string]string{"Content-Type": contentType}, body, BasicAuth{})
}

func (hc *HttpClient) HTTPRequest(ctx context.Context, method, requestURL, cookies string, headers map[string]string, body io.Reader, basicAuth BasicAuth) (*http.Response, error) {
req, err := http.NewRequestWithContext(ctx, method, requestURL, body)
if err != nil {
return nil, err
}

userAgent := useragent.PickRandom()
req.Header.Set("User-Agent", userAgent.String())
req.Header.Set("Accept", "*/*")
req.Header.Set("Accept-Language", "en")
req.Header.Set("Connection", "close")

if basicAuth.Username != "" || basicAuth.Password != "" {
req.SetBasicAuth(basicAuth.Username, basicAuth.Password)
}

if cookies != "" {
req.Header.Set("Cookie", cookies)
}

for key, value := range headers {
req.Header.Set(key, value)
}

return httpRequestWrapper(hc.Client, req)
}

func (hc *HttpClient) DiscardHTTPResponse(response *http.Response) {
if response != nil {
_, err := io.Copy(io.Discard, response.Body)
if err != nil {
gologger.Warning().Msgf("Could not discard response body: %s\n", err)
return
}
response.Body.Close()
}
}

func (hc *HttpClient) Close() {
hc.Client.CloseIdleConnections()
}

func httpRequestWrapper(client *http.Client, request *http.Request) (*http.Response, error) {
response, err := client.Do(request)
if err != nil {
return nil, err
}

if response.StatusCode != http.StatusOK {
requestURL, _ := url.QueryUnescape(request.URL.String())

gologger.Debug().MsgFunc(func() string {
buffer := new(bytes.Buffer)
_, _ = buffer.ReadFrom(response.Body)
return fmt.Sprintf("Response for failed request against %s:\n%s", requestURL, buffer.String())
})
return response, fmt.Errorf("unexpected status code %d received from %s", response.StatusCode, requestURL)
}
return response, nil
}
