在这里插入图片描述

一、项目介绍

随着互联网的快速发展,微博作为一款受欢迎的社交媒体平台,拥有大量的博文和用户评论等信息。为了更好地利用这些信息做好舆情监测和预警工作,开发了一个基于深度学习的网络热点舆情分析系统。该系统利用Python语言、MySQL数据库等大数据技术,对海量博文、热搜信息和评论数据进行处理和分析,通过与情感分析模型进行对比,准确识别出博主的积极言论与消极言论的比值,并生成对应的舆情预测报告。
总的来说,系统提供一个可视化的Web界面。首先通过爬虫技术获取对应的博文、热搜和评论等数据;考虑到数据量较大,系统先通过pandas进行文本的保存和读取,进而保存到MySQL数据库管理系统中,最后通过Django框架结合Vue框架进行界面展示。本研究的意义在于,通过对微博平台上的博文和用户评论等信息进行情感分析,提出舆情预警,防范微博用户发布不合法的信息。
在这里插入图片描述
在这里插入图片描述

二、文档介绍

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

三、运行截图


在这里插入图片描述在这里插入图片描述

name = 'ReSouSpiders'
    allowed_domains = ['weibo.com']
    # Page requested by start_requests and handed to parse.
    url = 'https://s.weibo.com/top/summary?cate=realtimehot'
    # Search URL template; '{}' is presumably filled with the search keyword (not used in this excerpt).
    search_url = 'https://s.weibo.com/weibo?q={}'
    # TODO: open cmd and run: start chrome  --flag-switches-begin --flag-switches-end --remote-debugging-port=9887
   def start_requests(self):
        """Yield the single initial request for ``self.url``, handled by ``self.parse``."""
        first_request = scrapy.Request(url=self.url, callback=self.parse)
        yield first_request
def parse(self, response):
        """Yield one item and one follow-up request per row of the hot-search table.

        The rows are the ``<tr>`` elements under ``<div id="pl_top_realtimehot">``;
        the first row is skipped.  For every remaining row a ``ReSouItem`` is
        yielded, followed by a request for the row's link whose callback is
        ``self.parse_detail`` and whose ``meta`` carries the item and ``page=1``.

        Rows without a link are skipped: ``extract_first()`` returns ``None``
        when nothing matches, and concatenating that to the base URL would
        raise ``TypeError`` and abort the remaining rows.
        """
        # Today's date, formatted as YYYY-MM-DD.
        today = datetime.datetime.now().strftime('%Y-%m-%d')
        # The <div> with id="pl_top_realtimehot" ...
        div = response.xpath('//div[@id="pl_top_realtimehot"]')
        # ... and every <tr> of the table inside it.
        trs = div.xpath('./table/tbody/tr')
        for tr in trs[1:]:
            # Relative link of the entry; None when the row has no anchor.
            href = tr.xpath('./td[@class="td-02"]/a/@href').extract_first()
            if href is None:
                continue
            link = 'https://s.weibo.com' + href

            item = ReSouItem()
            item['time'] = today
            # Rank: text of the first <td>.
            item['rank'] = tr.xpath('./td[1]/text()').extract_first()
            # Entry text.
            item['content'] = tr.xpath('./td[@class="td-02"]/a/text()').extract_first()
            item['link'] = link
            # Heat value shown next to the entry.
            item['hot'] = tr.xpath('./td[@class="td-02"]/span/text()').extract_first()
            # Tag labels (a list, possibly empty).
            item['tags'] = tr.xpath('./td[@class="td-03"]/i/text()').extract()
            yield item
            yield scrapy.Request(url=link, callback=self.parse_detail, meta={'item': item, 'page': 1})
import pandas as pd
import os
import pymysql
from sqlalchemy import create_engine


from urllib.parse import quote_plus

# Connection settings shared by the pymysql connection and the SQLAlchemy engine.
mysql_config = {
    'host': 'localhost',
    'user': 'root',        # change to the correct user name and password
    'password': 'root',
    'database': 'hz_project_webo',
    'port': 3306,
}

# Raw pymysql connection. No cursor is opened here: the table-creation step
# below opens (and closes) its own cursor with a `with` block, so a cursor
# created at this point would never be used or closed.
conn = pymysql.connect(**mysql_config)

# SQLAlchemy engine for the same database. The credentials are
# percent-encoded so that characters such as '@', ':' or '/' in the user name
# or password cannot break the connection URL.
engine = create_engine(
    "mysql+pymysql://"
    f"{quote_plus(mysql_config['user'])}:{quote_plus(mysql_config['password'])}"
    f"@{mysql_config['host']}:{mysql_config['port']}/{mysql_config['database']}"
)


# DDL for the eight tables used by this script: six statistics tables
# (topic_count, nickname_count, topic_sum, topic_avg, topic_max, topic_min)
# plus the raw `weibo` and `reshou` tables.
# The string is split on ';' and executed one statement at a time further down.
# NOTE(review): the later DataFrame.to_sql(..., if_exists='replace') calls look
# like they recreate these tables from the DataFrame dtypes — confirm that the
# column types declared here are what actually ends up in the database.
create_tables_sql = """
CREATE TABLE IF NOT EXISTS topic_count (
    id INT AUTO_INCREMENT PRIMARY KEY,
    topic VARCHAR(255),
    count INT
);

CREATE TABLE IF NOT EXISTS nickname_count (
    id INT AUTO_INCREMENT PRIMARY KEY,
    nickname VARCHAR(255),
    count INT
);

CREATE TABLE IF NOT EXISTS topic_sum (
    id INT AUTO_INCREMENT PRIMARY KEY,
    topic VARCHAR(255),
    share INT,
    comment INT,
    `like` INT
);

CREATE TABLE IF NOT EXISTS topic_avg (
    id INT AUTO_INCREMENT PRIMARY KEY,
    topic VARCHAR(255),
    share FLOAT,
    comment FLOAT,
    `like` FLOAT
);

CREATE TABLE IF NOT EXISTS topic_max (
    id INT AUTO_INCREMENT PRIMARY KEY,
    topic VARCHAR(255),
    share INT,
    comment INT,
    `like` INT
);

CREATE TABLE IF NOT EXISTS topic_min (
    id INT AUTO_INCREMENT PRIMARY KEY,
    topic VARCHAR(255),
    share INT,
    comment INT,
    `like` INT
);

CREATE TABLE IF NOT EXISTS weibo (
    id INT AUTO_INCREMENT PRIMARY KEY,
    topic VARCHAR(255),
    nickname VARCHAR(255),
    content TEXT,
    share INT,
    comment INT,
    `like` INT,
    tag VARCHAR(255),
    score FLOAT
);

CREATE TABLE IF NOT EXISTS reshou (
    id INT AUTO_INCREMENT PRIMARY KEY,
    time DATETIME,
    `rank` INT,
    content TEXT,
    link VARCHAR(255),
    hot INT,
    tags VARCHAR(255),
    bili FLOAT
);
"""


# Run every non-blank statement of create_tables_sql, then commit once.
with conn.cursor() as cursor:
    statements = (chunk for chunk in create_tables_sql.split(';') if chunk.strip())
    for statement in statements:
        cursor.execute(statement)
    conn.commit()


# Load the two source datasets from CSV files (paths are relative to the
# current working directory).
weibo = pd.read_csv('./dataset/weibo.csv')
reshou = pd.read_csv('./dataset/reshou.csv')
import re

def remove_chinese(text):
    """Return *text* with every character in the range U+4E00..U+9FA5 removed."""
    return re.sub(r'[\u4e00-\u9fa5]', '', text)

def remove_special_character(text):
    """Return *text* without whitespace, quotes or brackets.

    Removed characters: any whitespace, single and double quotes, and the
    bracket pairs ``[]``, ``{}``, ``()`` and ``【】``.
    """
    unwanted = r'[\s\'\"\[\]\{\}\(\)\【\】\n\r\t]'
    return re.sub(unwanted, '', text)

def remove_escape_character(text):
    """Return *text* with spelled-out escape sequences removed.

    Only the literal character sequences backslash-n, backslash-u3000 and
    backslash-u200b are stripped (in that order); real newline, U+3000 and
    U+200B characters are left untouched.
    """
    for fake_escape in (r'\n', '\\u3000', r'\u200b'):
        text = text.replace(fake_escape, '')
    return text
# Clean the tags column of the hot-search data.
reshou['tags'] = reshou['tags'].apply(remove_special_character)
# Clean the hot column: strip special characters first, then Chinese characters.
reshou['hot'] = reshou['hot'].apply(remove_special_character).apply(remove_chinese)

# Clean the content column of the post data.
weibo['content'] = (
    weibo['content']
    .apply(remove_special_character)
    .apply(remove_escape_character)
)

# Convert ASCII punctuation to its Chinese counterpart.
# The targets must be the full-width characters; replacing a character with
# itself would make this step a no-op.  regex=False is passed explicitly
# because the default of `regex` differs between pandas versions and '?' is a
# regular-expression metacharacter.
punctuation_map = {
    '!': '!',
    '?': '?',
    ',': ',',
    ';': ';',
    ':': ':',
    '~': '~',
    '`': '·',
}
for ascii_mark, chinese_mark in punctuation_map.items():
    weibo['content'] = weibo['content'].str.replace(ascii_mark, chinese_mark, regex=False)

# Replace every missing value with 0 so the counters below can be cast to int.
weibo = weibo.fillna(0)

for counter_column in ('share', 'comment', 'like'):
    weibo[counter_column] = weibo[counter_column].astype(int)

# Columns that exist in the table schemas but are left empty by this script.
weibo['tag'] = None
weibo['score'] = None

reshou['bili'] = None


def _frequency_table(frame, column):
    """Return a frame listing each distinct value of *column* and its row count."""
    table = frame[column].value_counts().reset_index()
    table.columns = [column, 'count']
    return table


def _aggregate_by_topic(frame, how):
    """Return share/comment/like aggregated with *how* for every topic."""
    spec = {metric: how for metric in ('share', 'comment', 'like')}
    return frame.groupby('topic').agg(spec).reset_index()


# Number of rows per topic and per nickname.
topic_count = _frequency_table(weibo, 'topic')
nickname_count = _frequency_table(weibo, 'nickname')

# Per-topic sum, mean, maximum and minimum of the three counters.
topic_sum = _aggregate_by_topic(weibo, 'sum')
topic_avg = _aggregate_by_topic(weibo, 'mean')
topic_max = _aggregate_by_topic(weibo, 'max')
topic_min = _aggregate_by_topic(weibo, 'min')


# Only the two raw-data frames get an explicit id column: the index is reset
# and a 1-based id is inserted as the first column.
weibo = weibo.reset_index(drop=True)
weibo_ids = range(1, len(weibo) + 1)
weibo.insert(loc=0, column='id', value=weibo_ids)

reshou = reshou.reset_index(drop=True)
reshou_ids = range(1, len(reshou) + 1)
reshou.insert(loc=0, column='id', value=reshou_ids)


try:
    # (table name, frame, write the DataFrame index as the `id` column?)
    # The statistics frames have no id column of their own, so their index is
    # written as `id`; the raw-data frames already carry an explicit id column.
    pending_writes = [
        ('topic_count', topic_count, True),
        ('nickname_count', nickname_count, True),
        ('topic_sum', topic_sum, True),
        ('topic_avg', topic_avg, True),
        ('topic_max', topic_max, True),
        ('topic_min', topic_min, True),
        ('weibo', weibo, False),
        ('reshou', reshou, False),
    ]
    for table_name, frame, index_as_id in pending_writes:
        if index_as_id:
            frame.to_sql(table_name, engine, if_exists='replace', index=True, index_label='id')
        else:
            frame.to_sql(table_name, engine, if_exists='replace', index=False)
        print(f"已写入{table_name}表")

except Exception as error:
    # Report the failure; the tables written before it stay written.
    print(f"写入数据库时发生错误: {error}")

finally:
    # The pymysql connection is closed whether or not the writes succeeded.
    conn.close()
    print("数据库连接已关闭")

更多推荐