"net/http"
"net/url"
"os"
- "os/exec"
+ "path"
"path/filepath"
"runtime"
"sort"
@@ -23,12 +23,13 @@ import (
/*
cyclone's url spider
spider will crawl a url and create a wordlist, or use flag -ngram to create ngrams
- version 0.5.10; initial github release
- version 0.6.2;
+ v0.5.10;
+ initial github release
+ v0.6.2;
fixed scraping logic & ngram creation bugs
switched from gocolly to goquery for web scraping
remove dups from word / ngrams output
- version 0.7.0;
+ v0.7.0;
added feature to allow crawling specific file extensions (html, htm, txt)
added check to keep crawler from crawling offsite URLs
added flag "-delay" to avoid rate limiting (-delay 100 == 100ms delay between URL requests)
@@ -38,20 +39,32 @@ version 0.7.0;
fixed bug when attempting to crawl deeper than available URLs to crawl
fixed crawl depth calculation
optimized code which runs 2.8x faster vs v0.6.x during bench testing
- version 0.7.1;
+ v0.7.1;
added progress bars to word / ngrams processing & file writing operations
added RAM usage monitoring
optimized order of operations for faster processing with less RAM
TO-DO: refactor code (func main is getting messy)
- TO-DO: add -file flag to allow crawling local plaintext files such as an ebook.txt
+ TO-DO: add -file flag to allow crawling local plaintext files such as an ebook.txt (COMPLETED in v0.8.0)
v0.8.0;
added flag "-file" to allow creating ngrams from a local plaintext file (ex: foobar.txt)
added flag "-timeout" for -url mode
added flag "-sort" which sorts output by frequency
fixed several small bugs
+ v0.8.1;
+ updated default -delay to 10ms
+ v0.9.0;
+ added flag "-url-match" to only crawl URLs containing a specified keyword; https://github.com/cyclone-github/spider/issues/6
+ added notice to user if no URLs are crawled when using "-crawl 1 -url-match"
+ exit early if zero URLs were crawled (no processing or file output)
+ use custom User-Agent "Spider/0.9.0 (+https://github.com/cyclone-github/spider)"
+ removed clearScreen function and its imports
+ fixed crawl-depth calculation logic
+ restricted link collection to .html, .htm, .txt and extension-less paths
+ upgraded dependencies and bumped Go version to v1.24.3
*/

// clear screen function
+ /*
func clearScreen() {
var cmd *exec.Cmd
@@ -71,6 +84,7 @@ func clearScreen() {
os.Exit(1)
}
}
+ */

// goquery
func getDocumentFromURL(targetURL string, timeout time.Duration) (*goquery.Document, bool, error) {
@@ -79,7 +93,7 @@ func getDocumentFromURL(targetURL string, timeout time.Duration) (*goquery.Docum
if err != nil {
return nil, false, err
}
- req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
+ req.Header.Set("User-Agent", "Spider/0.9.0 (+https://github.com/cyclone-github/spider)")

res, err := client.Do(req)
if err != nil {
@@ -106,16 +120,23 @@ func hasAnySuffix(s string, suffixes []string) bool {
func getLinksFromDocument(doc *goquery.Document, baseURL string) []string {
var links []string
- validSuffixes := []string{".html", ".htm", ".txt"} // specifically crawl file types, ex: if listed in a file server
-
- doc.Find("a[href]").Each(func(index int, item *goquery.Selection) {
- link, exists := item.Attr("href")
- if exists {
- absoluteLink := joinURL(baseURL, link) // convert to absolute URL
- // crawl any non-anchor or valid-file-type link
- if hasAnySuffix(link, validSuffixes) || !strings.HasPrefix(link, "#") {
- links = append(links, absoluteLink)
- }
+ validSuffixes := map[string]bool{
+ ".html": true,
+ ".htm": true,
+ ".txt": true,
+ }
+
+ doc.Find("a[href]").Each(func(_ int, item *goquery.Selection) {
+ href, exists := item.Attr("href")
+ if !exists || strings.HasPrefix(href, "#") {
+ return
+ }
+ absoluteLink := joinURL(baseURL, href)
+
+ // only allow approved extensions or none at all
+ ext := strings.ToLower(path.Ext(absoluteLink))
+ if ext == "" || validSuffixes[ext] {
+ links = append(links, absoluteLink)
}
})
return links
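Note (not part of the diff): the rewritten getLinksFromDocument skips anchors and keeps only .html, .htm, .txt, or extension-less links. A minimal standalone sketch of that allow-list rule, with illustrative sample URLs:

package main

import (
	"fmt"
	"path"
	"strings"
)

// allowedLink mirrors the filter used above: keep a link only if its
// extension is empty or one of .html, .htm, .txt.
func allowedLink(link string) bool {
	valid := map[string]bool{".html": true, ".htm": true, ".txt": true}
	ext := strings.ToLower(path.Ext(link))
	return ext == "" || valid[ext]
}

func main() {
	for _, l := range []string{
		"https://example.com/docs/page.html", // kept: .html
		"https://example.com/wordlists/",     // kept: no extension
		"https://example.com/logo.png",       // dropped: .png not allowed
	} {
		fmt.Println(l, allowedLink(l))
	}
}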
@@ -128,7 +149,7 @@ func getTextFromDocument(doc *goquery.Document) string {
return doc.Text()
}

- func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCountChan chan<- int, textsChan chan<- string, visited map[string]bool) {
+ func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCountChan chan<- int, textsChan chan<- string, visited map[string]bool, urlMatchStr string) {
if visited[u] {
return
}
@@ -142,28 +163,37 @@ func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCo
if !isSuccess {
return
}
- urlCountChan <- 1 // URL processed

- text := getTextFromDocument(doc)
- textsChan <- text // send the text for later n-gram processing
+ // only count & scrape text if it contains -url-match
+ if urlMatchStr == "" || strings.Contains(strings.ToLower(u), urlMatchStr) {
+ urlCountChan <- 1 // URL processed
+ textsChan <- getTextFromDocument(doc) // send the text for later n-gram processing
+ }

if depth > 1 {
baseDomain, err := getBaseDomain(u)
if err != nil {
fmt.Fprintf(os.Stderr, "Error getting base domain: %v\n", err)
return
}
- links := getLinksFromDocument(doc, u)
- for _, link := range links {
+ for _, link := range getLinksFromDocument(doc, u) {
time.Sleep(time.Duration(delay) * time.Millisecond)
+
linkDomain, err := getBaseDomain(link)
if err != nil {
fmt.Fprintf(os.Stderr, "Error parsing link %s: %v\n", link, err)
continue
}
- if linkDomain == baseDomain {
- crawlAndScrape(link, depth-1, delay, timeout, urlCountChan, textsChan, visited)
+ if linkDomain != baseDomain {
+ continue
+ }
+
+ // only *descend* into children that match (if urlMatchStr was provided)
+ if urlMatchStr != "" && !strings.Contains(strings.ToLower(link), urlMatchStr) {
+ continue
}
+
+ crawlAndScrape(link, depth-1, delay, timeout, urlCountChan, textsChan, visited, urlMatchStr)
}
}
}
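Note (not part of the diff): the new urlMatchStr argument gates both what gets scraped and which same-domain children are descended into, using a case-insensitive substring test. A condensed sketch of that predicate, with hypothetical example URLs:

package main

import (
	"fmt"
	"strings"
)

// matchesFilter reports whether a URL passes the -url-match filter:
// an empty filter matches everything, otherwise the check is
// case-insensitive substring containment, as in crawlAndScrape above.
func matchesFilter(u, urlMatchStr string) bool {
	return urlMatchStr == "" || strings.Contains(strings.ToLower(u), urlMatchStr)
}

func main() {
	filter := strings.ToLower("Wiki") // flag value is lowercased once, as in main()
	fmt.Println(matchesFilter("https://example.com/wiki/Go", filter))   // true
	fmt.Println(matchesFilter("https://example.com/blog/post", filter)) // false
}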
@@ -225,7 +255,7 @@ func monitorRAMUsage(stopChan chan bool, maxRAMUsage *float64) {
// main function
func main() {
- clearScreen()
+ // clearScreen()

cycloneFlag := flag.Bool("cyclone", false, "Display coded message")
versionFlag := flag.Bool("version", false, "Display version")
@@ -234,9 +264,10 @@ func main() {
ngramFlag := flag.String("ngram", "1", "Lengths of n-grams (e.g., \"1-3\" for 1, 2, and 3-length n-grams).")
oFlag := flag.String("o", "", "Output file for the n-grams")
crawlFlag := flag.Int("crawl", 1, "Depth of links to crawl")
- delayFlag := flag.Int("delay", 0, "Delay in ms between each URL lookup to avoid rate limiting")
+ delayFlag := flag.Int("delay", 10, "Delay in ms between each URL lookup to avoid rate limiting")
timeoutFlag := flag.Int("timeout", 1, "Timeout for URL crawling in seconds")
sortFlag := flag.Bool("sort", false, "Sort output by frequency")
+ urlMatchFlag := flag.String("url-match", "", "Only crawl URLs containing this keyword (case-insensitive)")
flag.Parse()

if *cycloneFlag {
@@ -246,7 +277,7 @@ func main() {
os.Exit(0)
}
if *versionFlag {
- version := "Cyclone's URL Spider v0.8.0"
+ version := "Cyclone's URL Spider v0.9.0"
fmt.Fprintln(os.Stderr, version)
os.Exit(0)
}
@@ -259,6 +290,8 @@ func main() {
}
fileMode := *fileFlag != ""

+ urlMatchStr := strings.ToLower(*urlMatchFlag)
+
var baseDomain string
if !fileMode {
// URL mode
@@ -333,7 +366,7 @@ func main() {
fmt.Fprintf(os.Stderr, "Base domain:\t%s\n", baseDomain)
fmt.Fprintf(os.Stderr, "Crawl depth:\t%d\n", *crawlFlag)
fmt.Fprintf(os.Stderr, "ngram len:\t%s\n", *ngramFlag)
- fmt.Fprintf(os.Stderr, "Crawl delay:\t%dms (increase this to avoid rate limiting)\n", *delayFlag)
+ fmt.Fprintf(os.Stderr, "Crawl delay:\t%dms (increase to avoid rate limiting)\n", *delayFlag)
fmt.Fprintf(os.Stderr, "Timeout:\t%d sec\n", *timeoutFlag)
}
@@ -370,15 +403,17 @@ func main() {
defer wg.Done()
ticker := time.NewTicker(50 * time.Millisecond)
defer ticker.Stop()
- totalCrawled := 1
+ totalCrawled := 0
for {
select {
case <-ticker.C:
fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d", totalCrawled)
case count := <-urlCountChan:
totalCrawled += count
case <-doneChan:
- fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d", totalCrawled)
+ if totalCrawled > 0 {
+ fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d", totalCrawled)
+ }
return
}
}
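Note (not part of the diff): the progress goroutine now starts its counter at 0 and only prints a final tally when at least one URL was counted. A self-contained sketch of the same ticker-plus-channels pattern, with illustrative names:

package main

import (
	"fmt"
	"os"
	"sync"
	"time"
)

func main() {
	countChan := make(chan int)
	doneChan := make(chan bool)
	var wg sync.WaitGroup

	wg.Add(1)
	go func() {
		defer wg.Done()
		ticker := time.NewTicker(50 * time.Millisecond)
		defer ticker.Stop()
		total := 0
		for {
			select {
			case <-ticker.C: // periodic in-place progress update
				fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d", total)
			case n := <-countChan: // one message per processed URL
				total += n
			case <-doneChan: // final print only if something was counted
				if total > 0 {
					fmt.Fprintf(os.Stderr, "\rURLs crawled:\t%d\n", total)
				}
				return
			}
		}
	}()

	for i := 0; i < 5; i++ { // simulate five crawled URLs
		countChan <- 1
		time.Sleep(20 * time.Millisecond)
	}
	close(doneChan)
	wg.Wait()
}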
@@ -388,7 +423,7 @@ func main() {
wg.Add(1)
go func() {
defer wg.Done()
- crawlAndScrape(*urlFlag, *crawlFlag, *delayFlag, timeoutDur, urlCountChan, textsChan, visitedURLs)
+ crawlAndScrape(*urlFlag, *crawlFlag, *delayFlag, timeoutDur, urlCountChan, textsChan, visitedURLs, urlMatchStr)
time.Sleep(100 * time.Millisecond)
close(textsChan)
close(doneChan)
@@ -401,6 +436,17 @@ func main() {
for text := range textsChan {
texts = append(texts, text)
}
+
+ // if nothing matched, exit early
+ if len(texts) == 0 {
+ time.Sleep(100)
+ fmt.Fprintln(os.Stderr, "No URLs crawled, exiting...") // boo, something went wrong!
+ if *crawlFlag == 1 {
+ fmt.Fprintln(os.Stderr, "Try increasing -crawl depth, or remove -url-match")
+ }
+ return
+ }
+
totalTexts := len(texts)

// set up progress bar ticker
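Note (not part of the diff): taken together, the v0.9.0 changes allow a keyword-restricted crawl while keeping the existing wordlist/n-gram pipeline. Assuming the binary is built as ./spider, an invocation might look like ./spider -url https://example.com -crawl 2 -url-match docs -ngram 1-3 -o wordlist.txt, with -delay left at its new 10 ms default.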