
NLP Machine-Learning Text Classification: Source Code + Dataset (ZIP)


Resource file list:

data_dev.zip contains approximately 2 files:
  1. LRmodel.py 6.31KB
  2. sample data.xlsx 2.43MB

Resource description:

This resource builds a text classification model based on the Logistic Regression algorithm. The complete pipeline covers data preprocessing, feature engineering, classifier construction, optimal hyperparameter search, and model evaluation and persistence. The full script is listed below.
# encoding:utf-8
import re
import time
import random

import jieba
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV


## Data preprocessing: cleaning, word segmentation, stop-word removal ##
def preprocess_text(content, words):
    print('Starting data preprocessing...')
    stopwords = pd.read_csv("C:/Users/xxx/Documents/NLP/data/Stopwords.txt",
                            index_col=False, quoting=3, sep="\t",
                            names=['stopword'], encoding='utf8')
    stopwords = stopwords['stopword'].values
    for line in range(len(content)):
        try:
            content[line] = content[line].lower()
            # Remove mentions (e.g. @zhangsan), URLs, hashtags (e.g. #Amazing),
            # apostrophe contractions (e.g. he's), and digits
            content[line] = re.sub(r"\S+@\S+|https*\S+|#\S+|'\w+|\d+", " ", content[line])
            # Remove punctuation and special characters
            content[line] = re.sub(r'[’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’!\[\\\]^_`{|}~]+', ' ', content[line])
            # Collapse runs of 2+ whitespace characters and trim both ends
            content[line] = re.sub(r'\s{2,}', " ", content[line])
            content[line] = content[line].strip()
            segs = jieba.lcut(content[line])
            segs = filter(lambda x: x != " ", segs)
            segs = filter(lambda x: x not in stopwords, segs)
            words.append(" ".join(list(segs)))
        except Exception:
            print("something went wrong----" + content[line])
            continue


## Text vectorization ##
def Vectorize(sentences):
    ## Shuffle the dataset for a more reliable train/test distribution ##
    random.shuffle(sentences)
    ## Split into training and test sets with scikit-learn ##
    x, y = zip(*sentences)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=10)
    ## TF-IDF vectorization; the fitted vectorizer doubles as the saved feature set ##
    vec = TfidfVectorizer(analyzer='word', max_df=0.6, ngram_range=(1, 3), max_features=30000)
    vec.fit(x_train)
    print('(training samples, features) =', vec.transform(x_train).shape)
    print('test set size:', len(x_test))
    return vec, x_train, y_train, x_test, y_test


## Randomized grid search for the best hyperparameters ##
def ChoosePara():
    pipe_rf = LogisticRegression()
    param_grid = {'C': [1, 10, 100, 1e3],
                  'solver': ['newton-cg', 'lbfgs', 'sag'],
                  'multi_class': ['ovr', 'multinomial'],
                  'max_iter': [100, 600, 1000]}
    # estimator: the learning algorithm; n_jobs: parallelism;
    # cv: number of cross-validation folds; scoring: evaluation metric
    gs = RandomizedSearchCV(estimator=pipe_rf, param_distributions=param_grid,
                            scoring='accuracy', cv=10, n_jobs=1)
    gs = gs.fit(vec.transform(x_train), y_train)
    print('Best score:', gs.best_score_)
    print('Best param:', gs.best_params_)


## Build the logistic regression classifier ##
def BuildModel():
    print('Training the classifier...')
    begin = time.perf_counter()
    # penalty: regularization type; solver: optimization algorithm;
    # multi_class: one-vs-rest vs. multinomial; C: inverse of regularization strength
    Classifier = LogisticRegression(C=100, penalty='l2', solver='newton-cg',
                                    multi_class='ovr', max_iter=100).fit(vec.transform(x_train), y_train)
    y_pred = Classifier.predict(vec.transform(x_test))
    print("\nLogistic regression evaluation metrics:\n", classification_report(y_test, y_pred))
    acc = Classifier.score(vec.transform(x_test), y_test)
    run_time = time.perf_counter() - begin
    print('Accuracy:', acc, 'training time:', run_time, 's')
    # Highest predicted class probability for each test sample
    y_pred_proba = Classifier.predict_proba(vec.transform(x_test)).max(axis=1)
    # Confusion matrix: rows = true classes, columns = predicted classes
    TestResult = pd.DataFrame(confusion_matrix(y_test, y_pred),
                              columns=['Low Risk', 'High Risk', 'MRA'],
                              index=['Itemized', 'Non-Itemized', 'Non-Itemized-MRA'])
    print('\n', TestResult)
    ## Export the test-set predictions ##
    # Store the Content_Words tuples in a dict keyed by the third element (the cut words)
    dict_a = {tup[2]: tup for tup in Content_Words}
    # Align the 1st element (image name) of Content_Words with the order of x_test
    TestID = [dict_a[b][0] for b in x_test if b in dict_a]
    # Align the 2nd element (raw content) of Content_Words with the order of x_test
    TestContent = [dict_a[b][1] for b in x_test if b in dict_a]
    TestDF = pd.DataFrame({'ImgName': TestID})
    TestDF['Content'] = TestContent
    TestDF['CutWords'] = x_test
    TestDF['label'] = y_test
    TestDF['Test_pred'] = list(y_pred)
    TestDF['Probability'] = y_pred_proba
    TestDF.to_excel('C:/Users/xxx/Desktop/test_set_predictions.xlsx', index=False)
    print('Test-set predictions exported')
    return vec, Classifier


if __name__ == '__main__':
    ## Load the data ##
    df = pd.read_excel("C:/Users/xxx/Desktop/Data.xlsx")
    df = df.dropna()  # drop missing values
    ## Convert to lists ##
    content = df.content.values.tolist()
    label = df.label.values.tolist()
    words = []
    ## Run preprocessing and save the segmented corpus ##
    preprocess_text(content, words)
    df['CutWords'] = words
    df.to_excel("C:/Users/xxx/Desktop/Data-clear.xlsx", index=False)
    ## Reload the saved cut words so the corpus need not be re-segmented on every run ##
    Cut_df = pd.read_excel("C:/Users/xxx/Desktop/Data-clear.xlsx")
    Cut_df = Cut_df.dropna()
    ImgName = Cut_df.ImgName.values.tolist()
    cut_content = Cut_df.content.values.tolist()
    cut_words = Cut_df.CutWords.values.tolist()
    label = Cut_df.label.values.tolist()
    print(pd.Series(label).value_counts())
    sentences = list(zip(cut_words, label))
    Content_Words = list(zip(ImgName, cut_content, cut_words))
    ## Vectorize the corpus ##
    vec, x_train, y_train, x_test, y_test = Vectorize(sentences)
    ## Search for the best hyperparameters ##
    # ChoosePara()
    ## Train and evaluate the model ##
    vec, Classifier = BuildModel()
    ## Save the model ##
    joblib.dump(vec, 'C:/Users/xxx/desktop/Features.m')
    print('Feature set saved')
    joblib.dump(Classifier, 'C:/Users/xxx/desktop/Model.m')
    print('Model saved!')
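The script persists the fitted TfidfVectorizer and the trained classifier but does not show how to use them for inference. Below is a minimal sketch of how the saved artifacts could be reloaded to classify a new document, assuming the Features.m and Model.m files produced above; the predict_one helper and the load paths are illustrative additions, not part of the original resource, and for brevity the sketch applies only lowercasing and jieba segmentation rather than the full cleaning pipeline.

import jieba
import joblib

# Assumed paths matching the joblib.dump calls above
vec = joblib.load('C:/Users/xxx/desktop/Features.m')      # fitted TfidfVectorizer
classifier = joblib.load('C:/Users/xxx/desktop/Model.m')  # trained LogisticRegression

def predict_one(text):
    # Approximate the training-time preprocessing: lowercase, then
    # space-join jieba tokens (stop-word removal omitted for brevity)
    cut = " ".join(jieba.lcut(text.lower()))
    features = vec.transform([cut])
    label = classifier.predict(features)[0]
    proba = classifier.predict_proba(features).max()
    return label, proba

label, proba = predict_one("这是一条待分类的文本")
print('predicted label:', label, 'confidence:', round(proba, 3))

For production use, the same cleaning and stop-word filtering applied in preprocess_text should be factored into a shared function so that training and inference transform text identically.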