Skip to content

抓取soup乱码 #32

@take7yo

Description

@take7yo

from calibre.web.feeds.recipes import BasicNewsRecipe

class Python_Tutorial(BasicNewsRecipe):
    """Calibre recipe that collects the runoob.com Python tutorial pages.

    The article list is built from the links found in the ``leftcolumn``
    sidebar of the tutorial's landing page.
    """

    title = 'Python Tutorial'
    description = ''
    cover_url = 'http://www.runoob.com/wp-content/uploads/2013/11/python.jpg'

    # Prepended to the site-relative hrefs found in the sidebar.
    url_prefix = 'http://www.runoob.com'
    no_stylesheets = True
    # Adding an explicit encoding did not help with the garbled output either.
    encoding = 'utf-8'
    keep_only_tags = [{ 'class': 'article-intro' }]

    def get_title(self, link):
        """Return the visible text of the anchor tag *link*, stripped.

        Uses the recipe's ``tag_to_string`` helper so that anchors whose
        first child is another tag (or that have no children at all) do not
        raise, unlike indexing ``link.contents[0]`` directly.
        """
        return self.tag_to_string(link).strip()

    def parse_index(self):
        """Build the feed list from the tutorial's sidebar links.

        Returns:
            A one-element list ``[('Python_Tutorial', articles)]`` where each
            article is a dict with ``title`` and ``url`` keys.

        Raises:
            ValueError: If the landing page has no ``div`` with id
                ``leftcolumn`` (e.g. the page layout changed or the download
                was not decoded correctly).
        """
        index_url = self.url_prefix + '/python/python-tutorial.html'
        soup = self.index_to_soup(index_url)
        # NOTE: printing ``soup`` at this point showed garbled text (the
        # problem being reported); the whole-page debug dump is not kept.
        sidebar = soup.find('div', { 'id': 'leftcolumn' })
        if sidebar is None:
            raise ValueError(
                "No <div id='leftcolumn'> found in %s" % index_url)

        articles = []
        for link in sidebar.findAll('a'):
            href = link.get('href')
            # Skip anchors without a target, in-page fragments, and links
            # that do not belong to the Python tutorial.
            if not href or '#' in href or '/python' not in href:
                continue

            # Only site-relative hrefs need the prefix.
            if href.startswith(('http://', 'https://')):
                url = href
            else:
                url = self.url_prefix + href

            articles.append({ 'title': self.get_title(link), 'url': url })

        return [('Python_Tutorial', articles)]

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions