2.4. 网络爬虫模型

目前为止,我们处理的都是单个静态页面。然而在实际运用中,我们常常需要以一个根页面为起点,通过内链进入网站的其他页面以获取更多数据,拥有这种功能的程序称为网络爬虫。

之所以叫网络爬虫,是因为它们可以在 Web 上“爬行”。它们本质上是一种递归过程:先获取某个 URL 对应的页面内容,从中找出新的链接,再对这些链接重复同样的操作。

2.4.1. 一个获取惠州学院新闻网要闻的简单爬虫模型

2.4.1.1. 新闻数据结构

class Image:
    """
    Data object describing a picture found on the web.

    It carries two pieces of text: the caption (title) of the picture and
    the link (URL) it can be loaded from.
    """

    # Class-level defaults; every instance overrides both in __init__.
    imgTitle = ""
    imgLink = ""

    def __init__(self, title, link):
        """Store the caption ``title`` and the picture URL ``link``."""
        self.imgTitle, self.imgLink = title, link

    # --- caption accessors ---

    def get_title(self):
        """Return the caption of the picture."""
        return self.imgTitle

    def set_title(self, title):
        """Replace the caption of the picture."""
        self.imgTitle = title

    # --- link accessors ---

    def get_link(self):
        """Return the URL of the picture."""
        return self.imgLink

    def set_link(self, link):
        """Replace the URL of the picture."""
        self.imgLink = link
import string


class News:
    """
    A news item: a title, an optional page link, text paragraphs and images.

    When the page link is the empty string, the object holds the article
    itself (its paragraphs and images). When the page link is non-empty,
    the object stands for a news item that lives on an external page.
    """

    # Link to the page with the full news story ("" when the article is stored here).
    pageLink = ""

    # Headline of the news item.
    title = ""

    def __init__(self, title):
        """Create a news item with the given headline and no content yet."""
        self.title = title
        # The containers are created per instance. As class attributes they
        # were shared, so pushing a paragraph or image onto one News object
        # made it appear in every other News object as well.
        self.paragraphs = []  # text paragraphs of the article
        self.images = []      # Image objects referenced by the article

    def get_title(self):
        """Return the headline."""
        return self.title

    def set_title(self, title):
        """Replace the headline."""
        self.title = title

    def push_paragraph(self, paragraph: str):
        """Append one paragraph of text to the end of the article."""
        self.paragraphs.append(paragraph)

    def pop_paragraph(self):
        """Remove and return the last paragraph; raises IndexError when empty."""
        return self.paragraphs.pop()

    def push_image(self, image: "Image"):
        """Append one image to the end of the image list."""
        self.images.append(image)

    def pop_image(self):
        """Remove and return the last image; raises IndexError when empty."""
        return self.images.pop()

    def get_link(self):
        """Return the page link ("" when the article is stored in this object)."""
        return self.pageLink

    def set_link(self, link):
        """Replace the page link."""
        self.pageLink = link

    def set_paragraphs(self, paragraphs):
        """Replace all paragraphs with a shallow copy of ``paragraphs``."""
        self.paragraphs = paragraphs[:]

    def set_images(self, images):
        """Replace all images with a shallow copy of ``images``."""
        self.images = images[:]

    def get_images(self):
        """Return a shallow copy of the image list."""
        return self.images[:]

    def get_paragraphs(self):
        """Return a shallow copy of the paragraph list."""
        return self.paragraphs[:]

    def print_news_content(self):
        """Print the title, link, every paragraph and every image to stdout."""
        print("Title :" + self.title)
        print("Hyperlink : "+self.pageLink)
        print("Here is article".center(30, '-'))
        for paragraph in self.paragraphs:
            print(paragraph)
        print("".center(30, '-'))
        print("images len is "+str(len(self.images)))
        for image in self.images:
            print(image.get_title() + ":" + image.get_link())

2.4.1.2. 新闻的爬虫模型 1

# get some information from school's website
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Constant: base URL of the Huizhou University (HZU) news site, https://news.hzu.edu.cn
HZU_NEWS_LINK = "https://news.hzu.edu.cn"


def get_news_from_hzu():
    """
    Collect the headline news entries from the HZU news front page.

    Every ``li.xxyw-news-item`` inside ``div.post-body > ul.post-news``
    becomes one News object. Entries whose link is an absolute http(s) URL
    only get that link stored; entries with a site-relative link are
    fetched and parsed by ``__get_news_link_content`` so the News object
    carries the article paragraphs and images.

    :return: a list of News objects, or None when the front page cannot be
        opened or the expected container elements are missing (an error
        message is printed in that case). An empty list suggests the
        selectors used here need updating.
    """
    try:
        # The context manager closes the HTTP response once the body is read.
        with urlopen(HZU_NEWS_LINK) as html:
            bs = BeautifulSoup(html.read(), 'html.parser')
        # Locate the list items that hold the news links and titles.
        items = bs.find('div', {'class': 'post-body'}) \
            .find('ul', {'class': 'post-news'}) \
            .find_all('li', {'class': 'xxyw-news-item'})
        news_list = []
        for item in items:
            anchor = item.a
            # Skip a malformed entry instead of aborting the whole crawl
            # with an uncaught TypeError/KeyError.
            if anchor is None or not anchor.get('href'):
                continue
            link = anchor['href']
            # Fall back to the visible link text when there is no title attribute.
            title = anchor.get('title') or anchor.get_text(strip=True)
            news = News(title)
            # Test the URL prefix, not a substring: a relative link such as
            # "/go?u=http://..." must still be treated as relative.
            if link.startswith(('http://', 'https://')):
                news.set_link(link)
            else:
                # urljoin copes with links both with and without a leading "/".
                news = __get_news_link_content(news, urljoin(HZU_NEWS_LINK, link))
            news_list.append(news)
        return news_list
    except AttributeError as e:
        print(e)
        print('某个标签元素不存在 或者url错误(服务器不存在)导致html.read()出错')
        return None
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be handled first.
        print(e)
        print('The page is not exist or have a error in getting page.')
        return None
    except URLError as e:
        print(e)
        print("url is wrong or the url couldn't open.")
        return None


def __get_news_link_content(news: News, article_url):
    """
    Download one article page and store its text and images in ``news``.

    Paragraphs are taken from elements inside ``div.wp_articlecontent``
    whose style is exactly ``text-indent:2em;text-align:left;``; images are
    the ``img`` tags with ``data-layer="photo"``.

    :param news: the News object to fill in.
    :param article_url: absolute URL of the article page.
    :return: the same ``news`` object. It is returned unchanged (after
        printing an error message) when the page cannot be opened or the
        article container is missing.
    """
    try:
        # The context manager closes the HTTP response once the body is read.
        with urlopen(article_url) as html:
            print("open sub url")
            body = html.read()
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be handled first.
        print(e)
        print('The page is not exist or have a error in getting page.')
        return news
    except URLError as e:
        print(e)
        print("url is wrong or the url couldn't open.")
        return news
    try:
        bs = BeautifulSoup(body, 'html.parser')
        # Locate the main body of the article.
        article = bs.find('div', {'class': 'wp_articlecontent'})
        # Collect the body text paragraphs.
        paragraphs = [
            item.get_text()
            for item in article.find_all(style='text-indent:2em;text-align:left;')
        ]
        images = []
        for item in article.find_all('img', {'data-layer': "photo"}):
            link = item.get('src')
            if not link:
                # An <img> without src has nothing to link to; skip it
                # rather than raising an uncaught KeyError.
                continue
            # Test the URL prefix, not a substring, and resolve relative
            # paths against the article page itself.
            if not link.startswith(('http://', 'https://')):
                link = urljoin(article_url, link)
            # The caption usually sits in the node right after the img's
            # parent tag. An image without such a node gets an empty
            # caption, so one missing caption no longer discards every
            # paragraph and image already collected.
            caption_node = item.parent.next_sibling
            title = caption_node.get_text() if caption_node is not None else ""
            images.append(Image(title, link))
        news.set_paragraphs(paragraphs)
        news.set_images(images)
        return news
    except AttributeError as e:
        print(e)
        print('某个标签元素不存在 或者url错误(服务器不存在)导致html.read()出错')
        return news


if __name__ == '__main__':
    # get_news_from_hzu() returns None when the front page could not be
    # fetched or parsed; treat that as "nothing to print" instead of
    # failing with "TypeError: 'NoneType' object is not iterable".
    for element in get_news_from_hzu() or []:
        # print_news_content() does its own printing and returns None, so
        # wrapping it in print() emitted a stray "None" after every item.
        element.print_news_content()

1

2021年4月30日 测试正常