信用卡欺诈检测:不平衡数据下的逻辑回归、下采样与SMOTE过采样实践
2026/4/13 19:50:20
"""Credit-card fraud detection on an imbalanced dataset (part 1).

Loads ``creditcard.csv``, standardizes the ``Amount`` feature, visualizes
the class imbalance, builds a balanced subset by undersampling the
majority class, fits a logistic-regression baseline on it, and finally
prepares a train/test split of the full (imbalanced) data for the
SMOTE experiment that follows.
"""
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from imblearn.over_sampling import SMOTE

# --- Load data and standardize the Amount feature -----------------------
data = pd.read_csv(r"creditcard.csv")
scaler = StandardScaler()
data["Amount"] = scaler.fit_transform(data[["Amount"]])
# Drop the raw timestamp column; it is not used as a feature here.
data = data.drop(["Time"], axis=1)

# --- Configure a CJK-capable font so Chinese labels render correctly ----
# (plt.rcParams replaces the deprecated `from pylab import mpl` idiom.)
plt.rcParams["font.sans-serif"] = ["Microsoft YaHei"]
plt.rcParams["axes.unicode_minus"] = False

# --- Visualize the positive/negative class distribution -----------------
# Series.value_counts() replaces the deprecated pd.value_counts(...).
labels_count = data["Class"].value_counts()
print(labels_count)
plt.title("正负例样本数")
plt.xlabel("类别")
plt.ylabel("频数")
labels_count.plot(kind='bar')
plt.show()

# --- Undersample the majority class to balance the data -----------------
data_train = data.copy()
positive_eg = data_train[data_train["Class"] == 0]  # majority: legitimate
negative_eg = data_train[data_train["Class"] == 1]  # minority: fraud
# Randomly draw as many majority rows as there are minority rows.
positive_eg = positive_eg.sample(len(negative_eg))
data_c = pd.concat([positive_eg, negative_eg])

# --- Features and label for the balanced subset -------------------------
column_names = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
                'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17',
                'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25',
                'V26', 'V27', 'V28', 'Amount']
x_whole = data_c[column_names]
y_whole = data_c[["Class"]]

# Train/test split on the balanced subset (30% used for training, as in
# the original experiment).
x_train_w, x_test_w, y_train_w, y_test_w = train_test_split(
    x_whole, y_whole, train_size=0.3, random_state=1000)

# --- Logistic-regression baseline on the undersampled data --------------
lr = LogisticRegression(C=0.01)
# ravel() flattens the one-column DataFrame into the 1-D array sklearn
# expects (avoids DataConversionWarning).
lr.fit(x_train_w, y_train_w.values.ravel())

test_predicted = lr.predict(x_test_w)
result = lr.score(x_test_w, y_test_w)
print(metrics.classification_report(y_test_w, test_predicted))

# --- Train/test split of the FULL imbalanced dataset --------------------
# (consumed by the SMOTE oversampling experiment below)
x_whole = data[column_names]
y_whole = data[["Class"]]
x_train, x_test, y_train, y_test = train_test_split(
    x_whole, y_whole, train_size=0.2, random_state=1000)
# --- SMOTE oversampling on the imbalanced training split ----------------
# Synthesizes minority-class samples so both classes are equally
# represented in the training data.
oversampler = SMOTE(random_state=0)
os_x_train, os_y_train = oversampler.fit_resample(x_train, y_train)

# --- Grid-search the regularization strength C by cross-validated recall
scores = []
c_param_range = [0.01, 0.1, 1, 10, 100]
z = 1
for i in c_param_range:
    start_time = time.time()
    lr = LogisticRegression(C=i, penalty="l2", solver="lbfgs", max_iter=1000)
    # Recall is the key metric for fraud detection: missing a fraud case
    # is costlier than a false alarm.
    score = cross_val_score(lr, os_x_train, np.ravel(os_y_train),
                            cv=8, scoring="recall")
    score_mean = sum(score) / len(score)
    scores.append(score_mean)
    end_time = time.time()
    print("第{}次...".format(z))
    print("time spend:{:.2f}".format(end_time - start_time))
    print("recall:{}".format(score_mean))
    z += 1

best_c = c_param_range[np.argmax(scores)]
print(f"........最优惩罚因子为:{best_c}........")


def cm_plot(y, yp):
    """Render the confusion matrix of true labels *y* vs predictions *yp*.

    Returns the ``matplotlib.pyplot`` module so the caller can chain
    ``.show()``.
    """
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt

    cm = confusion_matrix(y, yp)
    plt.matshow(cm, cmap=plt.cm.Blues)
    plt.colorbar()
    # Distinct row/col indices: the original inner loop reused ``y`` as
    # its index, shadowing the parameter.
    for row in range(len(cm)):
        for col in range(len(cm)):
            plt.annotate(cm[row, col], xy=(col, row),
                         horizontalalignment='center',
                         verticalalignment='center')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return plt


# --- Train with the best C and evaluate ---------------------------------
lr = LogisticRegression(C=best_c, penalty="l2", max_iter=1000)
lr.fit(os_x_train, np.ravel(os_y_train))

# Training-set report (on the oversampled data).
train_predicted = lr.predict(os_x_train)
print(metrics.classification_report(os_y_train, train_predicted, digits=6))
cm_plot(os_y_train, train_predicted).show()

# Test-set report (on the untouched, imbalanced test split).
test_predicted = lr.predict(x_test)
print(metrics.classification_report(y_test, test_predicted, digits=6))
cm_plot(y_test, test_predicted).show()

# 下采样 — section label fused onto this line in the source article.
SMOTE过采样
在实际应用中,建议根据具体问题和数据特性选择合适的采样技术。同时,通过合理的数据预处理和采样技术,我们可以显著提高模型在不平衡数据集上的性能,特别是在召回率这一关键指标上。