Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 11 additions & 6 deletions xgoogle/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# Code is licensed under MIT license.
#

import sys
import ssl
import random
import socket
Expand All @@ -18,6 +19,7 @@
import http.cookiejar
import http.cookies


BROWSERS = (
# Top most popular browsers in my access.log on 2009.02.12
# tail -50000 access.log |
Expand All @@ -42,7 +44,7 @@
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
)

TIMEOUT = 5 # socket timeout
TIMEOUT_SOCKET = 5 # socket timeout

class BrowserError(Exception):
def __init__(self, url, error):
Expand All @@ -52,6 +54,7 @@ def __init__(self, url, error):
class PoolHTTPConnection(http.client.HTTPConnection):
def connect(self):
"""Connect to the host and port specified in __init__."""
global TIMEOUT_SOCKET
msg = "getaddrinfo returns an empty list"
for res in socket.getaddrinfo(self.host, self.port, 0,
socket.SOCK_STREAM):
Expand All @@ -60,7 +63,7 @@ def connect(self):
self.sock = socket.socket(af, socktype, proto)
if self.debuglevel > 0:
print("connect: (%s, %s)" % (self.host, self.port))
self.sock.settimeout(TIMEOUT)
self.sock.settimeout(TIMEOUT_SOCKET)
self.sock.connect(sa)
except socket.error as msg:
if self.debuglevel > 0:
Expand All @@ -80,7 +83,9 @@ def http_open(self, req):
class Browser(object):
"""Provide a simulated browser object.
"""
def __init__(self, user_agent=BROWSERS[0], debug=False, use_pool=False):
def __init__(self, timeout, user_agent=BROWSERS[0], debug=False, use_pool=False):
global TIMEOUT_SOCKET
TIMEOUT_SOCKET = timeout
self.headers = {
'User-Agent': user_agent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
Expand Down Expand Up @@ -112,15 +117,15 @@ def get_page(self, url, data=None):
try:
response = self.opener.open(request)
return response.read()
except (urllib.error.HTTPError) as e:
except urllib.error.HTTPError as e:
# Check if we've reached the captcha
if e.code == 503:
print("Error: Captcha page has been reached, exiting...")
sys.exit(1)
raise BrowserError(url, str(e))
except (urllib.error.URLError) as e:
except urllib.error.URLError as e:
raise BrowserError(url, str(e))
except (socket.error, socket.sslerror) as msg:
except (socket.error, ssl.SSLError) as msg:
raise BrowserError(url, msg)
except socket.timeout as e:
raise BrowserError(url, "timeout")
Expand Down
32 changes: 16 additions & 16 deletions xgoogle/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,10 @@ class GoogleSearch(object):
SEARCH_URL_1 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&btnG=Google+Search"
NEXT_PAGE_1 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"

def __init__(self, query, random_agent=True, debug=False, lang="en", tld="com", re_search_strings=None, repeat=None):
def __init__(self, query, random_agent=True, debug=False, lang="en", tld="com", re_search_strings=None, repeat=None, timeout=5):
self.query = query
self.debug = debug
self.browser = Browser(debug=debug)
self.browser = Browser(debug=debug, timeout=timeout)
self.results_info = None
self.eor = False # end of results
self._page = 0
Expand Down Expand Up @@ -358,14 +358,14 @@ def _html_unescape(self, str):
def entity_replacer(m):
entity = m.group(1)
if entity in name2codepoint:
return unichr(name2codepoint[entity])
return chr(name2codepoint[entity])
else:
return m.group(0)

def ascii_replacer(m):
cp = int(m.group(1))
if cp <= 255:
return unichr(cp)
return chr(cp)
else:
return m.group(0)

Expand All @@ -378,10 +378,10 @@ class GoogleVideoSearch(object):
SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&num=%(num)d"
NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"

def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None):
def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None, timeout=5):
self.query = query
self.debug = debug
self.browser = Browser(debug=debug)
self.browser = Browser(debug=debug, timeout=timeout)
self.results_info = None
self.eor = False # end of results
self._page = 0
Expand Down Expand Up @@ -626,14 +626,14 @@ def _html_unescape(self, str):
def entity_replacer(m):
entity = m.group(1)
if entity in name2codepoint:
return unichr(name2codepoint[entity])
return chr(name2codepoint[entity])
else:
return m.group(0)

def ascii_replacer(m):
cp = int(m.group(1))
if cp <= 255:
return unichr(cp)
return chr(cp)
else:
return m.group(0)

Expand All @@ -646,10 +646,10 @@ class GoogleImageSearch(object):
SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&num=%(num)d"
NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"

def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None):
def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None, timeout=5):
self.query = query
self.debug = debug
self.browser = Browser(debug=debug)
self.browser = Browser(debug=debug, timeout=timeout)
self.results_info = None
self.eor = False # end of results
self._page = 0
Expand Down Expand Up @@ -880,14 +880,14 @@ def _html_unescape(self, str):
def entity_replacer(m):
entity = m.group(1)
if entity in name2codepoint:
return unichr(name2codepoint[entity])
return chr(name2codepoint[entity])
else:
return m.group(0)

def ascii_replacer(m):
cp = int(m.group(1))
if cp <= 255:
return unichr(cp)
return chr(cp)
else:
return m.group(0)

Expand All @@ -901,10 +901,10 @@ class GoogleFaceImageSearch(object):
SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&num=%(num)d"
NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"

def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None):
def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None, timeout=5):
self.query = query
self.debug = debug
self.browser = Browser(debug=debug)
self.browser = Browser(debug=debug, timeout=timeout)
self.results_info = None
self.eor = False # end of results
self._page = 0
Expand Down Expand Up @@ -1149,14 +1149,14 @@ def _html_unescape(self, str):
def entity_replacer(m):
entity = m.group(1)
if entity in name2codepoint:
return unichr(name2codepoint[entity])
return chr(name2codepoint[entity])
else:
return m.group(0)

def ascii_replacer(m):
cp = int(m.group(1))
if cp <= 255:
return unichr(cp)
return chr(cp)
else:
return m.group(0)

Expand Down