# Install dependencies:
#   pip install jieba scikit-learn prettytable
import unicodedata

import jieba
from prettytable import PrettyTable, TableStyle
from sklearn.feature_extraction.text import CountVectorizer
# --- 1. Raw data and tokenization ---
# Sample corpus: each entry is one document to be segmented and counted.
texts = [
    "人工智能正在改变世界",
    "人工智能正在革新包括医疗和金融在内的各个行业",
]
def jieba_tokenize(text: str) -> str:
    """Segment *text* with jieba and return the tokens joined by spaces.

    Args:
        text: The raw sentence to segment.

    Returns:
        A single string in which the tokens produced by ``jieba.lcut`` are
        separated by one space each, so that a word-boundary based
        vectorizer can treat every token as one term.
    """
    return ' '.join(jieba.lcut(text))
tokenized_texts = [jieba_tokenize(t) for t in texts]

# --- 2. Build the term-frequency matrix ---
# CountVectorizer's default token_pattern only keeps tokens made of two or
# more word characters, which silently drops single-character Chinese words.
# The documents are already segmented, so accept any run of word characters
# to keep the matrix consistent with the segmentation that gets printed.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
X = vectorizer.fit_transform(tokenized_texts)
vocab = vectorizer.get_feature_names_out()  # one label per matrix column
matrix = X.toarray()  # dense counts: one row per document
# --- 3. Create the table and set its header ---
# The first header column is the document label; the remaining columns come
# from the vocabulary.
field_names = ["文档", *vocab]
table = PrettyTable()
table.field_names = field_names

# Add one data row per document: its label followed by that document's
# counts, in the same order as the vocabulary columns.
for i, row in enumerate(matrix, start=1):
    table.add_row([f"文本{i}", *row])
# --- 4. Configure the table style ---
# Center the content of every column ("c": center, "l": left, "r": right).
for field in field_names:
    table.align[field] = "c"

# Optional: adjust the overall look of the table.
table.set_style(TableStyle.MSWORD_FRIENDLY)  # Microsoft Word friendly preset
# NOTE(review): the border is switched off right after the preset is applied;
# presumably this overrides whatever border the preset enabled -- confirm
# that this is the intended final look.
table.border = False  # whether to draw the border
# table.horizontal_char = '─'  # optional: custom horizontal line character
# table.vertical_char = '│'  # optional: custom vertical line character
# --- 5. Control column widths (optional; make widths match the headers) ---
def get_str_width(s: str) -> int:
    """Return the display width of *s* measured in terminal cells.

    Characters whose Unicode East Asian Width class is Wide ("W") or
    Fullwidth ("F") count as two cells; every other character counts as one.
    This covers Chinese ideographs as well as full-width punctuation, kana,
    Hangul and CJK extension blocks, which a plain U+4E00..U+9FFF range check
    would undercount.

    Args:
        s: The text to measure.

    Returns:
        The total width in cells; 0 for an empty string.
    """
    return sum(
        2 if unicodedata.east_asian_width(ch) in ("W", "F") else 1
        for ch in s
    )
# Pin every column to a fixed width. Start from the header width and widen
# to the widest cell in that column, so that no cell is wider than the
# maximum width set for its column (a row label can be wider than its header).
col_widths = {field: get_str_width(field) for field in field_names}
for row in table.rows:
    for field, cell in zip(field_names, row):
        col_widths[field] = max(col_widths[field], get_str_width(str(cell)))

# NOTE(review): _min_width/_max_width are private PrettyTable attributes, kept
# here because the original relied on them -- confirm against the installed
# PrettyTable version. Each attribute gets its own copy so that a later change
# to one mapping cannot silently alter the other.
table._min_width = dict(col_widths)
table._max_width = dict(col_widths)
# --- 6. Print the results ---
# First the space-separated segmentation of each document, then the table.
print("分词结果:")
for t in tokenized_texts:
    print(t)
print("\n词频矩阵(PrettyTable 格式,精确对齐):")
print(table)