Commit 159eed5

Merge pull request #7 from cyclone-github/dev
v0.9.0
2 parents 7eaad95 + e3c17d4 commit 159eed5

5 files changed (+130 −42 lines)

CHANGELOG.md

Lines changed: 15 additions & 0 deletions
@@ -1,3 +1,18 @@
+### v0.9.0
+```
+added flag "-url-match" to only crawl URLs containing a specified keyword; https://github.com/cyclone-github/spider/issues/6
+added notice to user if no URLs are crawled when using "-crawl 1 -url-match"
+exit early if zero URLs were crawled (no processing or file output)
+use custom User-Agent "Spider/0.9.0 (+https://github.com/cyclone-github/spider)"
+removed clearScreen function and its imports
+fixed crawl-depth calculation logic
+fixed restrict link collection to .html, .htm, .txt and extension-less paths
+upgraded dependencies and bumped Go version to v1.24.3
+```
+### v0.8.1
+```
+updated default -delay to 10ms
+```
 ### v0.8.0
 ```
 added flag "-file" to allow creating ngrams from a local plaintext file (ex: foobar.txt)

README.md

Lines changed: 33 additions & 6 deletions
@@ -8,7 +8,7 @@
 
 # Spider: URL Mode
 ```
-spider -url 'https://forum.hashpwn.net' -crawl 2 -delay 20 -sort -ngram 1-3 -timeout 1 -o forum.hashpwn.net_spider.txt
+spider -url 'https://forum.hashpwn.net' -crawl 2 -delay 20 -sort -ngram 1-3 -timeout 1 -url-match wordlist -o forum.hashpwn.net_spider.txt
 ```
 ```
 ----------------------
@@ -21,15 +21,15 @@ Crawl depth: 2
 ngram len: 1-3
 Crawl delay: 20ms (increase this to avoid rate limiting)
 Timeout: 1 sec
-URLs crawled: 56
+URLs crawled: 2
 Processing... [====================] 100.00%
-Unique words: 3164
-Unique ngrams: 17313
+Unique words: 475
+Unique ngrams: 1977
 Sorting n-grams by frequency...
 Writing... [====================] 100.00%
 Output file: forum.hashpwn.net_spider.txt
-RAM used: 0.03 GB
-Runtime: 8.634s
+RAM used: 0.02 GB
+Runtime: 2.283s
 ```
 # Spider: File Mode
 ```
@@ -66,10 +66,37 @@ Wordlist & ngram creation tool to crawl a given url or process a local file to c
 - `spider -url 'https://github.com/cyclone-github' -timeout 2`
 - To create ngrams len 1-3 and sort output by frequency, use "-ngram 1-3" "-sort"
 - `spider -url 'https://github.com/cyclone-github' -ngram 1-3 -sort`
+- To filter crawled URLs by keyword "foobar"
+- `spider -url 'https://github.com/cyclone-github' -url-match foobar`
 - To process a local text file, create ngrams len 1-3 and sort output by frequency
 - `spider -file foobar.txt -ngram 1-3 -sort`
 - Run `spider -help` to see a list of all options
 
+### spider -help
+```
+-crawl int
+    Depth of links to crawl (default 1)
+-cyclone
+    Display coded message
+-delay int
+    Delay in ms between each URL lookup to avoid rate limiting (default 10)
+-file string
+    Path to a local file to scrape
+-url-match string
+    Only crawl URLs containing this keyword (case-insensitive)
+-ngram string
+    Lengths of n-grams (e.g., "1-3" for 1, 2, and 3-length n-grams). (default "1")
+-o string
+    Output file for the n-grams
+-sort
+    Sort output by frequency
+-timeout int
+    Timeout for URL crawling in seconds (default 1)
+-url string
+    URL of the website to scrape
+-version
+    Display version
+```
 ### Compile from source:
 - If you want the latest features, compiling from source is the best option since the release version may run several revisions behind the source code.
 - This assumes you have Go and Git installed
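
The new `-url-match` entry in the help output comes from a standard `flag` definition. As a rough sketch (flag wiring only, not the full program), defining the flag with an empty default and lowercasing it once keeps later substring checks case-insensitive:

```go
package main

import (
	"flag"
	"fmt"
	"strings"
)

func main() {
	// Empty default means "no filtering", matching the help text above.
	urlMatchFlag := flag.String("url-match", "", "Only crawl URLs containing this keyword (case-insensitive)")
	flag.Parse()

	// Lowercase once so every later comparison can be case-insensitive
	// without re-converting the keyword for each URL.
	urlMatchStr := strings.ToLower(*urlMatchFlag)
	fmt.Println("url-match filter:", urlMatchStr)
}
```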

go.mod

Lines changed: 2 additions & 2 deletions
@@ -1,10 +1,10 @@
 module spider
 
-go 1.24.1
+go 1.24.3
 
 require github.com/PuerkitoBio/goquery v1.10.3
 
 require (
     github.com/andybalholm/cascadia v1.3.3 // indirect
-    golang.org/x/net v0.39.0 // indirect
+    golang.org/x/net v0.40.0 // indirect
 )

go.sum

Lines changed: 2 additions & 2 deletions
@@ -24,8 +24,8 @@ golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
 golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
 golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
 golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
-golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY=
-golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E=
+golang.org/x/net v0.40.0 h1:79Xs7wF06Gbdcg4kdCCIQArK11Z1hr5POQ6+fIYHNuY=
+golang.org/x/net v0.40.0/go.mod h1:y0hY0exeL2Pku80/zKK7tpntoX23cqL3Oa6njdgRtds=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=

spider.go

Lines changed: 78 additions & 32 deletions
@@ -8,7 +8,7 @@ import (
     "net/http"
     "net/url"
     "os"
-    "os/exec"
+    "path"
     "path/filepath"
     "runtime"
     "sort"
@@ -23,12 +23,13 @@ import (
 /*
 cyclone's url spider
 spider will crawl a url and create a wordlist, or use flag -ngram to create ngrams
-version 0.5.10; initial github release
-version 0.6.2;
+v0.5.10;
+initial github release
+v0.6.2;
 fixed scraping logic & ngram creations bugs
 switched from gocolly to goquery for web scraping
 remove dups from word / ngrams output
-version 0.7.0;
+v0.7.0;
 added feature to allow crawling specific file extensions (html, htm, txt)
 added check to keep crawler from crawling offsite URLs
 added flag "-delay" to avoid rate limiting (-delay 100 == 100ms delay between URL requests)
@@ -38,20 +39,32 @@ version 0.7.0;
 fixed bug when attempting to crawl deeper than available URLs to crawl
 fixed crawl depth calculation
 optimized code which runs 2.8x faster vs v0.6.x during bench testing
-version 0.7.1;
+v0.7.1;
 added progress bars to word / ngrams processing & file writing operations
 added RAM usage monitoring
 optimized order of operations for faster processing with less RAM
 TO-DO: refactor code (func main is getting messy)
-TO-DO: add -file flag to allow crawling local plaintext files such as an ebook.txt
+TO-DO: add -file flag to allow crawling local plaintext files such as an ebook.txt (COMPLETED in v0.8.0)
 v0.8.0;
 added flag "-file" to allow creating ngrams from a local plaintext file (ex: foobar.txt)
 added flag "-timeout" for -url mode
 added flag "-sort" which sorts output by frequency
 fixed several small bugs
+v0.8.1;
+updated default -delay to 10ms
+v0.9.0;
+added flag "-url-match" to only crawl URLs containing a specified keyword; https://github.com/cyclone-github/spider/issues/6
+added notice to user if no URLs are crawled when using "-crawl 1 -url-match"
+exit early if zero URLs were crawled (no processing or file output)
+use custom User-Agent "Spider/0.9.0 (+https://github.com/cyclone-github/spider)"
+removed clearScreen function and its imports
+fixed crawl-depth calculation logic
+fixed restrict link collection to .html, .htm, .txt and extension-less paths
+upgraded dependencies and bumped Go version to v1.24.3
 */
 
 // clear screen function
+/*
 func clearScreen() {
     var cmd *exec.Cmd
 
@@ -71,6 +84,7 @@ func clearScreen() {
         os.Exit(1)
     }
 }
+*/
 
 // goquery
 func getDocumentFromURL(targetURL string, timeout time.Duration) (*goquery.Document, bool, error) {
@@ -79,7 +93,7 @@ func getDocumentFromURL(targetURL string, timeout time.Duration) (*goquery.Docum
     if err != nil {
         return nil, false, err
     }
-    req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
+    req.Header.Set("User-Agent", "Spider/0.9.0 (+https://github.com/cyclone-github/spider)")
 
     res, err := client.Do(req)
     if err != nil {
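
For illustration, a stripped-down sketch of how a request ends up carrying the new `Spider/0.9.0` User-Agent; the `newSpiderRequest` helper name and the example URL are hypothetical, and error handling is minimal:

```go
package main

import (
	"fmt"
	"net/http"
	"time"
)

// newSpiderRequest builds a GET request that identifies itself with the
// project's custom User-Agent instead of a browser string.
func newSpiderRequest(targetURL string) (*http.Request, error) {
	req, err := http.NewRequest("GET", targetURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Spider/0.9.0 (+https://github.com/cyclone-github/spider)")
	return req, nil
}

func main() {
	req, err := newSpiderRequest("https://example.com/index.html")
	if err != nil {
		panic(err)
	}
	fmt.Println(req.Header.Get("User-Agent"))

	// In the real tool the request is executed by an http.Client whose
	// Timeout mirrors the -timeout flag, e.g. 1 second.
	_ = &http.Client{Timeout: 1 * time.Second}
}
```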
@@ -106,16 +120,23 @@ func hasAnySuffix(s string, suffixes []string) bool {
 
 func getLinksFromDocument(doc *goquery.Document, baseURL string) []string {
     var links []string
-    validSuffixes := []string{".html", ".htm", ".txt"} // specifically crawl file types, ex: if listed in a file server
-
-    doc.Find("a[href]").Each(func(index int, item *goquery.Selection) {
-        link, exists := item.Attr("href")
-        if exists {
-            absoluteLink := joinURL(baseURL, link) // convert to absolute URL
-            // crawl any non-anchor or valid-file-type link
-            if hasAnySuffix(link, validSuffixes) || !strings.HasPrefix(link, "#") {
-                links = append(links, absoluteLink)
-            }
+    validSuffixes := map[string]bool{
+        ".html": true,
+        ".htm":  true,
+        ".txt":  true,
+    }
+
+    doc.Find("a[href]").Each(func(_ int, item *goquery.Selection) {
+        href, exists := item.Attr("href")
+        if !exists || strings.HasPrefix(href, "#") {
+            return
+        }
+        absoluteLink := joinURL(baseURL, href)
+
+        // only allow approved extensions or none at all
+        ext := strings.ToLower(path.Ext(absoluteLink))
+        if ext == "" || validSuffixes[ext] {
+            links = append(links, absoluteLink)
         }
     })
     return links
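
A standalone sketch of the allow-list idea behind this hunk: links are kept only when `path.Ext` yields `.html`, `.htm`, `.txt`, or nothing at all. The `allowedLink` helper and the sample URLs are made up for illustration:

```go
package main

import (
	"fmt"
	"path"
	"strings"
)

// allowedLink mirrors the new getLinksFromDocument filter: keep
// .html/.htm/.txt and extension-less paths, skip everything else.
func allowedLink(absoluteLink string) bool {
	validSuffixes := map[string]bool{".html": true, ".htm": true, ".txt": true}
	ext := strings.ToLower(path.Ext(absoluteLink))
	return ext == "" || validSuffixes[ext]
}

func main() {
	for _, link := range []string{
		"https://example.com/wordlists/rockyou.txt",
		"https://example.com/docs/index.HTML",
		"https://example.com/forum/topic/123", // no extension
		"https://example.com/archive.zip",
	} {
		fmt.Printf("%-45s %v\n", link, allowedLink(link))
	}
}
```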
@@ -128,7 +149,7 @@ func getTextFromDocument(doc *goquery.Document) string {
     return doc.Text()
 }
 
-func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCountChan chan<- int, textsChan chan<- string, visited map[string]bool) {
+func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCountChan chan<- int, textsChan chan<- string, visited map[string]bool, urlMatchStr string) {
     if visited[u] {
         return
     }
@@ -142,28 +163,37 @@ func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCo
     if !isSuccess {
         return
     }
-    urlCountChan <- 1 // URL processed
 
-    text := getTextFromDocument(doc)
-    textsChan <- text // send the text for later n-gram processing
+    // only count & scrape text if it contains -url-match
+    if urlMatchStr == "" || strings.Contains(strings.ToLower(u), urlMatchStr) {
+        urlCountChan <- 1                     // URL processed
+        textsChan <- getTextFromDocument(doc) // send the text for later n-gram processing
+    }
 
     if depth > 1 {
         baseDomain, err := getBaseDomain(u)
         if err != nil {
            fmt.Fprintf(os.Stderr, "Error getting base domain: %v\n", err)
            return
         }
-        links := getLinksFromDocument(doc, u)
-        for _, link := range links {
+        for _, link := range getLinksFromDocument(doc, u) {
            time.Sleep(time.Duration(delay) * time.Millisecond)
+
            linkDomain, err := getBaseDomain(link)
            if err != nil {
                fmt.Fprintf(os.Stderr, "Error parsing link %s: %v\n", link, err)
                continue
            }
-            if linkDomain == baseDomain {
-                crawlAndScrape(link, depth-1, delay, timeout, urlCountChan, textsChan, visited)
+            if linkDomain != baseDomain {
+                continue
+            }
+
+            // only *descend* into children that match (if urlMatchStr was provided)
+            if urlMatchStr != "" && !strings.Contains(strings.ToLower(link), urlMatchStr) {
+                continue
            }
+
+            crawlAndScrape(link, depth-1, delay, timeout, urlCountChan, textsChan, visited, urlMatchStr)
         }
     }
 }
@@ -225,7 +255,7 @@ func monitorRAMUsage(stopChan chan bool, maxRAMUsage *float64) {
 
 // main function
 func main() {
-    clearScreen()
+    //clearScreen()
 
     cycloneFlag := flag.Bool("cyclone", false, "Display coded message")
     versionFlag := flag.Bool("version", false, "Display version")
@@ -234,9 +264,10 @@ func main() {
     ngramFlag := flag.String("ngram", "1", "Lengths of n-grams (e.g., \"1-3\" for 1, 2, and 3-length n-grams).")
     oFlag := flag.String("o", "", "Output file for the n-grams")
     crawlFlag := flag.Int("crawl", 1, "Depth of links to crawl")
-    delayFlag := flag.Int("delay", 0, "Delay in ms between each URL lookup to avoid rate limiting")
+    delayFlag := flag.Int("delay", 10, "Delay in ms between each URL lookup to avoid rate limiting")
     timeoutFlag := flag.Int("timeout", 1, "Timeout for URL crawling in seconds")
     sortFlag := flag.Bool("sort", false, "Sort output by frequency")
+    urlMatchFlag := flag.String("url-match", "", "Only crawl URLs containing this keyword (case-insensitive)")
     flag.Parse()
 
     if *cycloneFlag {
@@ -246,7 +277,7 @@ func main() {
         os.Exit(0)
     }
     if *versionFlag {
-        version := "Cyclone's URL Spider v0.8.0"
+        version := "Cyclone's URL Spider v0.9.0"
         fmt.Fprintln(os.Stderr, version)
         os.Exit(0)
     }
@@ -259,6 +290,8 @@ func main() {
     }
     fileMode := *fileFlag != ""
 
+    urlMatchStr := strings.ToLower(*urlMatchFlag)
+
     var baseDomain string
     if !fileMode {
         // URL mode
@@ -333,7 +366,7 @@ func main() {
         fmt.Fprintf(os.Stderr, "Base domain:\t%s\n", baseDomain)
         fmt.Fprintf(os.Stderr, "Crawl depth:\t%d\n", *crawlFlag)
         fmt.Fprintf(os.Stderr, "ngram len:\t%s\n", *ngramFlag)
-        fmt.Fprintf(os.Stderr, "Crawl delay:\t%dms (increase this to avoid rate limiting)\n", *delayFlag)
+        fmt.Fprintf(os.Stderr, "Crawl delay:\t%dms (increase to avoid rate limiting)\n", *delayFlag)
         fmt.Fprintf(os.Stderr, "Timeout:\t%d sec\n", *timeoutFlag)
     }
 
@@ -370,15 +403,17 @@ func main() {
         defer wg.Done()
         ticker := time.NewTicker(50 * time.Millisecond)
         defer ticker.Stop()
-        totalCrawled := 1
+        totalCrawled := 0
         for {
            select {
            case <-ticker.C:
                fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d", totalCrawled)
            case count := <-urlCountChan:
                totalCrawled += count
            case <-doneChan:
-                fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d", totalCrawled)
+                if totalCrawled > 0 {
+                    fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d", totalCrawled)
+                }
                return
            }
         }
@@ -388,7 +423,7 @@ func main() {
     wg.Add(1)
     go func() {
         defer wg.Done()
-        crawlAndScrape(*urlFlag, *crawlFlag, *delayFlag, timeoutDur, urlCountChan, textsChan, visitedURLs)
+        crawlAndScrape(*urlFlag, *crawlFlag, *delayFlag, timeoutDur, urlCountChan, textsChan, visitedURLs, urlMatchStr)
         time.Sleep(100 * time.Millisecond)
         close(textsChan)
         close(doneChan)
@@ -401,6 +436,17 @@ func main() {
     for text := range textsChan {
         texts = append(texts, text)
     }
+
+    // if nothing matched, exit early
+    if len(texts) == 0 {
+        time.Sleep(100)
+        fmt.Fprintln(os.Stderr, "No URLs crawled, exiting...") // boo, something went wrong!
+        if *crawlFlag == 1 {
+            fmt.Fprintln(os.Stderr, "Try increasing -crawl depth, or remove -url-match")
+        }
+        return
+    }
+
     totalTexts := len(texts)
 
     // set up progress bar ticker
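
Finally, a rough self-contained sketch of the new zero-URL guard: after draining the texts channel, the program now bails out with a hint instead of producing an empty output file. The channel setup and variable names below are simplified stand-ins for the real program flow, not the actual main function:

```go
package main

import (
	"fmt"
	"os"
)

func main() {
	// Pretend the crawler finished without sending any page text,
	// e.g. because -url-match filtered out every URL at -crawl 1.
	textsChan := make(chan string, 4)
	close(textsChan)

	var texts []string
	for text := range textsChan {
		texts = append(texts, text)
	}

	crawlDepth := 1 // stand-in for the -crawl flag value
	if len(texts) == 0 {
		fmt.Fprintln(os.Stderr, "No URLs crawled, exiting...")
		if crawlDepth == 1 {
			fmt.Fprintln(os.Stderr, "Try increasing -crawl depth, or remove -url-match")
		}
		return
	}
	fmt.Printf("processing %d pages\n", len(texts))
}
```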
