Skip to content
This repository was archived by the owner on Feb 17, 2023. It is now read-only.

Commit 36feb06

Browse files
committed
Merge pull request #26 from crackcomm/master
Improved bot recognition
2 parents ea7cf3c + e25e612 commit 36feb06

File tree

2 files changed

+24
-3
lines changed

2 files changed

+24
-3
lines changed

all_test.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,26 @@ var uastrings = []struct {
6262
ua: "Facebot",
6363
expected: "Browser:Facebot Bot:true Mobile:false",
6464
},
65+
{
66+
title: "NutchCVS",
67+
ua: "NutchCVS/0.8-dev (Nutch; http://lucene.apache.org/nutch/bot.html; nutch-agent@lucene.apache.org)",
68+
expected: "Browser:NutchCVS Bot:true Mobile:false",
69+
},
70+
{
71+
title: "MJ12bot",
72+
ua: "Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php?+)",
73+
expected: "Mozilla:5.0 Browser:MJ12bot-v1.2.4 Bot:true Mobile:false",
74+
},
75+
{
76+
title: "MJ12bot",
77+
ua: "MJ12bot/v1.0.8 (http://majestic12.co.uk/bot.php?+)",
78+
expected: "Browser:MJ12bot Bot:true Mobile:false",
79+
},
80+
{
81+
title: "AhrefsBot",
82+
ua: "Mozilla/5.0 (compatible; AhrefsBot/4.0; +http://ahrefs.com/robot/)",
83+
expected: "Mozilla:5.0 Browser:AhrefsBot-4.0 Bot:true Mobile:false",
84+
},
6585

6686
// Internet Explorer
6787
{

bot.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ func (p *UserAgent) fixOther(sections []section) {
7575
}
7676
}
7777

78+
// botRegex matches, case-insensitively, any name containing one of the
// substrings "bot", "crawler", "spider", "spyder", "search", "worm", "fetch"
// or "nutch". It is compiled once at package scope; MustCompile panics at
// initialization if the pattern is invalid.
var botRegex = regexp.MustCompile("(?i)(bot|crawler|sp(i|y)der|search|worm|fetch|nutch)")
79+
7880
// Check if we're dealing with a bot or with some weird browser. If that is the
7981
// case, the receiver will be modified accordingly.
8082
func (p *UserAgent) checkBot(sections []section) {
@@ -83,9 +85,8 @@ func (p *UserAgent) checkBot(sections []section) {
8385
if len(sections) == 1 && sections[0].name != "Mozilla" {
8486
p.mozilla = ""
8587

86-
// Check whether the name has some suspicious "bot" in his name.
87-
reg, _ := regexp.Compile("(?i)bot")
88-
if reg.Match([]byte(sections[0].name)) {
88+
// Check whether the name contains a suspicious keyword such as "bot" or "crawler".
89+
if botRegex.Match([]byte(sections[0].name)) {
8990
p.setSimple(sections[0].name, "", true)
9091
return
9192
}

0 commit comments

Comments
 (0)