Skip to content

Commit fb3d7a7

Browse files
committed
WIP: chromedp transport integration for netnaija
1 parent 22213c6 commit fb3d7a7

File tree

4 files changed

+92
-11
lines changed

4 files changed

+92
-11
lines changed

engine/engines.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,10 @@ type Engine interface {
5555
// Scrape : Parse queries a url and return results
5656
func Scrape(engine Engine) ([]Movie, error) {
5757
// Config Vars
58-
seleniumURL := fmt.Sprintf("%s/wd/hub", viper.GetString("selenium-url"))
58+
// seleniumURL := fmt.Sprintf("%s/wd/hub", viper.GetString("selenium-url"))
5959
cacheDir := viper.GetString("cache-dir")
6060
var (
61-
t *transport.Transport
61+
t *transport.ChromeDpTransport
6262
err error
6363
)
6464

@@ -70,8 +70,8 @@ func Scrape(engine Engine) ([]Movie, error) {
7070

7171
// Add Cloud Flare scraper bypasser
7272
if engine.getName() == "NetNaija" {
73-
log.Debug("Switching to Selenium transport")
74-
t, err = transport.NewSeleniumTransport(http.DefaultTransport, seleniumURL)
73+
log.Debug("Switching to ChromeDpTransport")
74+
t, err = transport.NewChromeDpTransport(http.DefaultTransport)
7575
if err != nil {
7676
log.Fatal(err)
7777
}
@@ -81,7 +81,7 @@ func Scrape(engine Engine) ([]Movie, error) {
8181
// Close the WebDriver Instance
8282
defer func() {
8383
if engine.getName() == "NetNaija" {
84-
t.WebDriver.Quit()
84+
t.Cancel()
8585
}
8686
}()
8787

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ go 1.14
55
require (
66
github.com/bisoncorps/mplayer v0.0.0-20200330192254-e2f647162350
77
github.com/briandowns/spinner v1.11.1
8+
github.com/chromedp/chromedp v0.5.3
89
github.com/gocolly/colly v1.2.0
910
github.com/gocolly/colly/v2 v2.0.2-0.20200302170631-ef2d2b016e78
1011
github.com/iawia002/annie v0.0.0-20200217104547-c4b096ad402a

go.sum

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ github.com/briandowns/spinner v1.11.1/go.mod h1:QOuQk7x+EaDASo80FEXwlwiA+j/PPIcX
5050
github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
5151
github.com/cheggaaa/pb v1.0.25 h1:tFpebHTkI7QZx1q1rWGOKhbunhZ3fMaxTvHDWn1bH/4=
5252
github.com/cheggaaa/pb v1.0.25/go.mod h1:pQciLPpbU0oxA0h+VJYYLxO+XeDQb5pZijXscXHm81s=
53+
github.com/chromedp/cdproto v0.0.0-20200116234248-4da64dd111ac h1:T7V5BXqnYd55Hj/g5uhDYumg9Fp3rMTS6bykYtTIFX4=
54+
github.com/chromedp/cdproto v0.0.0-20200116234248-4da64dd111ac/go.mod h1:PfAWWKJqjlGFYJEidUM6aVIWPr0EpobeyVWEEmplX7g=
55+
github.com/chromedp/chromedp v0.5.3 h1:F9LafxmYpsQhWQBdCs+6Sret1zzeeFyHS5LkRF//Ffg=
56+
github.com/chromedp/chromedp v0.5.3/go.mod h1:YLdPtndaHQ4rCpSpBG+IPpy9JvX0VD+7aaLxYgYj28w=
5357
github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
5458
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e h1:fY5BOSpyZCqRo5OhCuC+XN+r/bBCmeuuJtjz+bCNIf8=
5559
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
@@ -80,6 +84,12 @@ github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V
8084
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
8185
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
8286
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
87+
github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee h1:s+21KNqlpePfkah2I+gwHF8xmJWRjooY+5248k6m4A0=
88+
github.com/gobwas/httphead v0.0.0-20180130184737-2c6c146eadee/go.mod h1:L0fX3K22YWvt/FAX9NnzrNzcI4wNYi9Yku4O0LKYflo=
89+
github.com/gobwas/pool v0.2.0 h1:QEmUOlnSjWtnpRGHF3SauEiOsy82Cup83Vf2LcMlnc8=
90+
github.com/gobwas/pool v0.2.0/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
91+
github.com/gobwas/ws v1.0.2 h1:CoAavW/wd/kulfZmSIBt6p24n4j7tHgNVCjsfHVNUbo=
92+
github.com/gobwas/ws v1.0.2/go.mod h1:szmBTxLgaFppYjEmNtny/v3w89xOydFnnZMcgRRu/EM=
8393
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
8494
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
8595
github.com/gocolly/colly/v2 v2.0.2-0.20200302170631-ef2d2b016e78 h1:BEK0DJ6e4lXgbYrClpJpa9Bh3IT7HunTNExaP9Y+6gI=
@@ -150,6 +160,8 @@ github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8Nz
150160
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
151161
github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q=
152162
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
163+
github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08 h1:V0an7KRw92wmJysvFvtqtKMAPmvS5O0jtB0nYo6t+gs=
164+
github.com/knq/sysutil v0.0.0-20191005231841-15668db23d08/go.mod h1:dFWs1zEqDjFtnBXsd1vPOZaLsESovai349994nHx3e0=
153165
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
154166
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
155167
github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
@@ -163,6 +175,8 @@ github.com/lunixbochs/vtclean v0.0.0-20180621232353-2d01aacdc34a/go.mod h1:pHhQN
163175
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
164176
github.com/magiconair/properties v1.8.1 h1:ZC2Vc7/ZFkGmsVC9KvOjumD+G5lXy2RtTKyzRKO2BQ4=
165177
github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
178+
github.com/mailru/easyjson v0.7.0 h1:aizVhC/NAAcKWb+5QsU1iNOZb4Yws5UO2I+aIprQITM=
179+
github.com/mailru/easyjson v0.7.0/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs=
166180
github.com/manifoldco/promptui v0.7.0 h1:3l11YT8tm9MnwGFQ4kETwkzpAwY2Jt9lCrumCUW4+z4=
167181
github.com/manifoldco/promptui v0.7.0/go.mod h1:n4zTdgP0vr0S3w7/O/g98U+e0gwLScEXGwov2nIKuGQ=
168182
github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU=
@@ -342,6 +356,8 @@ golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7w
342356
golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
343357
golang.org/x/sys v0.0.0-20191104094858-e8c54fb511f6 h1:ZJUmhYTp8GbGC0ViZRc2U+MIYQ8xx9MscsdXnclfIhw=
344358
golang.org/x/sys v0.0.0-20191104094858-e8c54fb511f6/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
359+
golang.org/x/sys v0.0.0-20200116001909-b77594299b42 h1:vEOn+mP2zCOVzKckCZy6YsCtDblrpj/w7B9nxGNELpg=
360+
golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
345361
golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
346362
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
347363
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=

transport/transport.go

Lines changed: 70 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,16 @@ package transport
22

33
import (
44
"bytes"
5+
"context"
56
"fmt"
6-
"os"
77
"io/ioutil"
88
"net/http"
9+
"os"
910
"strings"
1011
"time"
1112

13+
"github.com/chromedp/chromedp"
14+
log "github.com/sirupsen/logrus"
1215
"github.com/tebeka/selenium"
1316
"github.com/tebeka/selenium/firefox"
1417
)
@@ -17,11 +20,17 @@ const (
1720
userAgent = `Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36`
1821
)
1922

20-
type Transport struct {
23+
type SeleniumTransport struct {
2124
upstream http.RoundTripper
2225
WebDriver selenium.WebDriver
2326
}
2427

28+
// ChromeDpTransport is an http.RoundTripper that fetches pages through a
// chromedp browser context instead of a plain HTTP connection.
type ChromeDpTransport struct {
	// upstream is the RoundTripper handed to NewChromeDpTransport.
	upstream http.RoundTripper
	// Ctx is the chromedp context in which RoundTrip runs its browser actions.
	Ctx context.Context
	// Cancel cancels Ctx; call it once the transport is no longer needed.
	Cancel context.CancelFunc
}
33+
2534
func NewClient() (c *http.Client, err error) {
2635
seleniumURL := fmt.Sprintf("%s/wd/hub", os.Getenv("GOPHIE_SELENIUM_URL"))
2736
fmt.Println("selenium url " + seleniumURL)
@@ -37,23 +46,78 @@ func NewClient() (c *http.Client, err error) {
3746
return
3847
}
3948

40-
func NewSeleniumTransport(upstream http.RoundTripper, seleniumURL string) (*Transport, error) {
49+
func NewSeleniumTransport(upstream http.RoundTripper, seleniumURL string) (*SeleniumTransport, error) {
4150

4251
caps := selenium.Capabilities{"browserName": "firefox"}
4352
firefoxCaps := firefox.Capabilities{Args: []string{"-headless"}}
4453
caps.AddFirefox(firefoxCaps)
4554
wd, err := selenium.NewRemote(caps, seleniumURL)
4655

4756
if err != nil {
48-
return &Transport{}, err
57+
return &SeleniumTransport{}, err
4958
}
50-
return &Transport{
59+
return &SeleniumTransport{
5160
upstream: upstream,
5261
WebDriver: wd,
5362
}, nil
5463
}
5564

56-
func (t *Transport) RoundTrip(r *http.Request) (*http.Response, error) {
65+
func NewChromeDpTransport(upstream http.RoundTripper) (*ChromeDpTransport, error) {
66+
67+
ctx, cancel := chromedp.NewContext(
68+
context.Background(),
69+
chromedp.WithLogf(log.Debugf),
70+
)
71+
72+
return &ChromeDpTransport{
73+
upstream: upstream,
74+
Ctx: ctx,
75+
Cancel: cancel,
76+
}, nil
77+
}
78+
79+
func (t *ChromeDpTransport) RoundTrip(r *http.Request) (*http.Response, error) {
80+
var (
81+
body string
82+
err error
83+
)
84+
85+
if r.Header.Get("User-Agent") == "" {
86+
r.Header.Set("User-Agent", userAgent)
87+
}
88+
89+
if r.Header.Get("Referer") == "" {
90+
r.Header.Set("Referer", r.URL.String())
91+
}
92+
93+
r.Header.Set("Content-Type", "text/html")
94+
95+
log.Debug("Set Headers for page ", r.URL.String())
96+
97+
if err = chromedp.Run(t.Ctx,
98+
chromedp.Navigate(r.URL.String()),
99+
chromedp.WaitVisible(`main`),
100+
chromedp.OuterHTML("html", &body),
101+
); err != nil {
102+
return &http.Response{}, err
103+
}
104+
log.Debug("Successfully retrieved body")
105+
106+
response := &http.Response{
107+
Status: "200 OK",
108+
StatusCode: 200,
109+
Proto: "HTTP/1.1",
110+
ProtoMajor: 1,
111+
ProtoMinor: 1,
112+
Body: ioutil.NopCloser(bytes.NewBufferString(body)),
113+
ContentLength: int64(len(body)),
114+
Request: r,
115+
Header: r.Header,
116+
}
117+
return response, nil
118+
}
119+
120+
func (t *SeleniumTransport) RoundTrip(r *http.Request) (*http.Response, error) {
57121
var (
58122
title string
59123
body string

0 commit comments

Comments
 (0)