from typing import Dict, List, Tuple

import joblib
import numpy as np
from sklearn.ensemble import RandomForestClassifier


def prepare_data(data: Dict[str, List[np.ndarray]]) -> Tuple[np.ndarray, np.ndarray]:
    """Convert per-class arrays into a feature matrix and a label vector.

    Every array in every list is treated as exactly one sample. Classes are
    numbered 0, 1, 2, ... in the iteration order of ``data``.

    Args:
        data: Mapping from class name to the list of feature arrays that
            belong to that class.

    Returns:
        A tuple ``(X, y)`` where ``X`` stacks all samples and ``y`` holds the
        integer label of each sample, in the same order.
    """
    # Assign one integer label per class name, following dict order.
    label_map = {class_name: index for index, class_name in enumerate(data)}

    features = []
    labels = []
    for class_name, arrays_list in data.items():
        label = label_map[class_name]
        for arr in arrays_list:
            # NOTE: an earlier, now disabled, variant expanded arrays with
            # more than one dimension into one sample per row. Here each
            # array is appended as a single sample.
            features.append(np.array(arr))
            labels.append(label)

    return np.array(features), np.array(labels)


def train_model(
    data: Dict[str, List[np.ndarray]],
) -> Tuple[RandomForestClassifier, Dict[int, str]]:
    """Train a random-forest classifier on the given per-class arrays.

    Args:
        data: Training data, mapping class name to a list of feature arrays.

    Returns:
        A tuple ``(model, label_map)`` where ``model`` is the fitted
        classifier and ``label_map`` maps each integer label back to its
        class name.
    """
    X, y = prepare_data(data)

    # Fixed seed so repeated training runs are reproducible.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)

    # Inverse mapping (integer label -> class name). It uses the same
    # enumeration order as prepare_data, so both mappings agree.
    label_map = {index: class_name for index, class_name in enumerate(data)}

    return model, label_map


def predict(
    model,
    label_map: Dict[int, str],
    val_data: Dict[str, List[np.ndarray]],
) -> Tuple[float, List[np.ndarray]]:
    """Evaluate a trained model on labelled validation data.

    Each array is predicted on its own. It counts as a success only when the
    model yields exactly one prediction for it and that prediction equals the
    class name the array is stored under.

    Args:
        model: Trained model exposing ``predict``.
        label_map: Mapping from integer label to class name.
        val_data: Validation data, mapping class name to a list of feature
            arrays.

    Returns:
        A tuple ``(accuracy, failed)``. ``accuracy`` is the fraction of
        arrays classified correctly (``0.0`` when there are no arrays at
        all). ``failed`` lists the arrays that were not classified correctly.
    """
    failed = []
    correct = 0
    total = 0

    for class_name, arrays_list in val_data.items():
        for arr in arrays_list:
            sample = np.asarray(arr)
            total += 1

            # A single 1-D sample has to be presented as one row.
            batch = sample.reshape(1, -1) if sample.ndim == 1 else sample

            # Predict and translate integer labels back to class names.
            pred_classes = [label_map[label] for label in model.predict(batch)]

            if len(pred_classes) == 1 and pred_classes[0] == class_name:
                correct += 1
            else:
                # Record the offending sample itself, not the whole list of
                # arrays of its class.
                failed.append(sample)

    # Avoid dividing by zero when no validation arrays were supplied.
    if total == 0:
        return 0.0, failed
    return correct / total, failed


if __name__ == "__main__":
    # The script entry point is deliberately disabled: it exits before
    # reaching the code below, which expects a training dict ``d`` that this
    # file does not define. ``raise SystemExit`` is used instead of
    # ``exit()`` because the latter is only available when the ``site``
    # module has been loaded.
    raise SystemExit

    # Train the model.
    model, label_map = train_model(d)
    print("训练完成")
    joblib.dump(model, "model.pkl")

    # Prediction on validation data is disabled. The per-class printing loop
    # that used to follow iterated ``predictions.items()``, which does not
    # match the ``(accuracy, failed)`` tuple that predict() returns.
    # predictions = predict(model, label_map, val)

    # Disabled model performance report:
    # X_train, y_train = prepare_data(val)
    # model.score(X_train, y_train)