鸟语天空

Python 文本数据增强

post by:追风剑情 2026-4-24 15:58

文本数据增强(Text Data Augmentation)指通过对原始文本进行有策略的变换（如同义词替换、随机删除、回译等），生成更多样化的训练样本，目的是：

扩充有限的训练数据集；
提高模型的泛化能力；
缓解过拟合。

一、安装依赖包

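jieba 是一个中文分词库；示例还通过 langchain-ollama 调用本地 Ollama 模型，因此需要一并安装。
pip install jieba langchain-ollama

运行示例前，还需启动本地 Ollama 服务并拉取示例所用的模型：
ollama pull deepseek-r1:7b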

示例

import random
import jieba
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage

# ========== 1. Initialize the local Ollama DeepSeek model ==========
# Chat client bound to the server at http://localhost:11434 and the
# "deepseek-r1:7b" model tag; it is only used by the LLM step of this script.
llm = ChatOllama(base_url="http://localhost:11434", model="deepseek-r1:7b")

# ========== 2. Chinese stop-word table (punctuation included) ==========
# Tokens in this set are never picked as synonym-replacement candidates.
stopwords_cn = {
    # Pronouns
    '我', '你', '他', '她', '它', '我们', '你们', '他们',
    # Particles, copula and common adverbs
    '的', '了', '是', '也', '就', '都', '不', '在', '有',
    # Conjunctions
    '和', '与', '或', '但', '然后', '所以', '因为', '如果', '虽然',
    # Determiners and quantifiers
    '这个', '那个', '一个', '一些', '这种', '那些',
    # Common punctuation marks
    '。', '，', '、', '！', '？', '；', '：',
    '“', '”', '‘', '’',
    '（', '）', '【', '】', '《', '》',
}

# ========== 3. Chinese synonym dictionary ==========
# Maps a token to the list of replacement strings that may be substituted
# for it. Lookups are exact matches on the whole token, so an entry only
# applies when the tokenizer emits that exact string.
synonyms_dict = {
    "自然语言": ["自然语言处理", "自然语言技术", "语言处理"],
    "人工智能": ["AI", "人工智慧"],
    "领域": ["方向", "范畴", "分支"],
    "子领域": ["分支", "子方向"],
    "处理": ["加工", "分析"],
    "技术": ["方法", "手段"],
    "机器": ["计算机"],
    "学习": ["训练", "学习过程"],
    "深度": ["深层"],
    "神经网络": ["神经网路", "类神经网络"],
}

def get_synonyms_cn(word):
    """Look up the synonyms registered for a Chinese word.

    Returns the list stored in ``synonyms_dict`` for ``word``, or a new
    empty list when the word has no entry.
    """
    return synonyms_dict[word] if word in synonyms_dict else []

# ========== 4. Synonym replacement ==========
def synonym_replacement_cn(words_list, n):
    """Replace up to ``n`` distinct words with a randomly chosen synonym.

    Stop words are never replaced, and words without a synonym entry are
    skipped without counting towards ``n``.

    Args:
        words_list: Tokens of the sentence. The list is never modified.
        n: Maximum number of distinct words to replace. Values <= 0 leave
            the sentence unchanged.

    Returns:
        A new list of the same length in which every occurrence of each
        selected word has been swapped for its chosen synonym.
    """
    # De-duplicate the candidates so a word that occurs several times can
    # only use one replacement slot; dict.fromkeys keeps first-seen order.
    candidates = list(
        dict.fromkeys(w for w in words_list if w not in stopwords_cn)
    )
    random.shuffle(candidates)

    replacements = {}
    for target in candidates:
        # Check the budget before replacing so that n <= 0 replaces nothing.
        if len(replacements) >= n:
            break
        syns = get_synonyms_cn(target)
        if syns:
            replacements[target] = random.choice(syns)

    # Substitute against the original tokens in one pass, so a synonym that
    # was just inserted can never be picked up and replaced a second time.
    return [replacements.get(w, w) for w in words_list]

# ========== 5. Random deletion (guards against an empty result) ==========
def random_deletion_cn(words_list, p):
    """Randomly drop words from a tokenized sentence.

    Each word is removed independently with probability ``p``.

    Args:
        words_list: Tokens of the sentence. The input is never modified.
        p: Per-word deletion probability, expected to lie in [0, 1].

    Returns:
        A new list with the surviving words in their original order. Inputs
        with at most one word come back as an unchanged copy. If every word
        happens to be deleted, only the first word is kept so the result is
        never empty.
    """
    if len(words_list) <= 1:
        return list(words_list)

    kept = [w for w in words_list if random.random() > p]
    if not kept:
        # Keep just the first word. Returning the whole sentence here would
        # silently turn a "delete everything" draw into "delete nothing".
        return [words_list[0]]
    return kept

# ========== 6. Original Chinese test sentence ==========
sentence = "自然语言处理是人工智能的一个子领域。"
# Manually register the term in jieba's dictionary before segmenting;
# presumably so it is kept as one token — confirm against the printed output.
jieba.add_word("子领域")

# Segment the sentence into a list of tokens and show the result.
words = jieba.lcut(sentence)
print("原始分词结果:", words)

# ========== 7. Rule-based augmentation ==========
# Replace up to two words with synonyms, then join the tokens without a
# separator to rebuild the sentence.
augmented_words = synonym_replacement_cn(words, n=2)
augmented_sentence = ''.join(augmented_words)
print("同义词替换后:", augmented_sentence)

# Drop each token of the original segmentation with probability 0.2.
deleted_words = random_deletion_cn(words, p=0.2)
deleted_sentence = ''.join(deleted_words)
print("随机删除后:", deleted_sentence)

# ========== 8. Generate an augmented sentence with the local LLM ==========
# The prompt asks for a paraphrased or expanded version of the sentence and
# for the augmented sentence only, with no explanation or reasoning.
# NOTE(review): whether the model honors the "no reasoning" request is not
# checked here — response.content is printed as returned.
prompt = f"请对以下中文文本进行改写或扩充，生成一个意思相近但表达不同的增强版本，只输出增强后的句子，不要包含任何额外解释或思考过程：\n{ sentence }"
try:
    response = llm.invoke([HumanMessage(content=prompt)])
    print("LLM 增强结果:", response.content)
except Exception as e:
    # Top-level script boundary: report the failure instead of crashing, so
    # the rule-based results printed earlier remain usable.
    print(f"LLM 调用失败: {e}")

运行测试

评论：

发表评论：