word cloud 我之前试着做过:
标题:
posts:
with mask:
标题:
posts:
代码很简单,主要是没有导出的数据库文件
xm_wordcloud.py
# src: https://github.com/leisurelicht/WordCloud-CN/
# required:
# wget https://github.com/leisurelicht/WordCloud-CN/blob/master/stopwords.txt
from pathlib import Path
import jieba
from wordcloud import WordCloud
import sqlite3
import pandas as pd
font_path = '/usr/share/fonts/wenquanyi/wqy-microhei/wqy-microhei.ttc'
stopword_path = 'stopwords.txt'
db = Path('/f/discourse_public_import/xjtu.app.public.dump.2024.04.05.db')
def txt2wc(txt, save_png_name='test.png'):
with open(stopword_path) as f_stop:
f_stop_text = f_stop.read()
f_stop_seg_list = f_stop_text.splitlines()
seg_list = jieba.cut(txt, cut_all=False)
my_word_list = []
for my_word in seg_list:
if len(my_word.strip()) > 1 and not (my_word.strip() in f_stop_seg_list):
my_word_list.append(my_word)
my_word_str = ' '.join(my_word_list)
from pathlib import Path
from PIL import Image
import numpy as np
mask = np.array(Image.open(Path('~/Documents/jdm-mask-large.png').expanduser()))
# use mask = None to generate wordcloud without mask
wc = WordCloud(
font_path=font_path,
background_color="white",
mask=mask,
random_state=42,
width=mask.shape[1]-200,
height=mask.shape[0]-100,
)
wc.generate(my_word_str)
wc.to_file(save_png_name)
con = sqlite3.connect(db)
cur = con.cursor()
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall()) # [('topics',), ('users',), ('posts',), ('likes',)]
df_topics = pd.read_sql_query("SELECT * FROM topics", con)
df_posts = pd.read_sql_query("SELECT * FROM posts", con)
titles = df_topics['title']
posts = df_posts['raw']
titles = [t for t in titles if not ('关于' in t and '类别' in t)]
posts = [p.replace('<br>', '\n') for p in posts]
txt2wc('\n'.join(list(posts)), 'xmen-posts-wordcloud.png')
txt2wc('\n'.join(list(titles)), 'xmen-titles-wordcloud.png')