"""Train and evaluate a random-forest classifier on per-class feature arrays."""
import numpy as np
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
from typing import Dict, List, Tuple
|
|
import joblib
|
|
|
|
def prepare_data(data: Dict[str, List[np.ndarray]]) -> Tuple[np.ndarray, np.ndarray]:
    """
    Convert per-class feature arrays into a feature matrix and a label vector.

    Each class receives an integer label equal to its position in the
    iteration order of ``data``. Every entry of a class's list is converted
    with ``np.array`` and contributes one element along the first axis of
    ``X`` and one entry of ``y``.

    Args:
        data: Mapping from class name to the list of feature arrays that
            belong to that class.

    Returns:
        A tuple ``(X, y)``: ``X`` stacks all feature arrays in iteration
        order and ``y`` holds the matching integer labels.
    """
    samples = []
    targets = []

    # The position of a class in the mapping doubles as its numeric label.
    for class_index, class_arrays in enumerate(data.values()):
        for entry in class_arrays:
            samples.append(np.array(entry))
            targets.append(class_index)

    return np.array(samples), np.array(targets)
|
|
|
|
def train_model(data: Dict[str, List[np.ndarray]]):
    """
    Fit a random-forest classifier on per-class training data.

    Args:
        data: Training data mapping each class name to a list of feature
            arrays.

    Returns:
        A tuple ``(model, label_map)``: the fitted classifier and a dict
        that maps each integer label back to its class name.
    """
    features, targets = prepare_data(data)

    # Fixed seed so repeated runs on the same data build the same forest.
    classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    classifier.fit(features, targets)

    # Integer label -> class name, numbered by position in ``data``.
    index_to_name = dict(enumerate(data))

    return classifier, index_to_name
|
|
|
|
def predict(
    model,
    label_map: Dict[int, str],
    val_data: Dict[str, List[np.ndarray]],
) -> Tuple[float, List[List[np.ndarray]]]:
    """
    Evaluate a trained model on labelled validation data.

    Every array in ``val_data`` is passed to ``model.predict``. An array
    counts as a success only when the model returns exactly one prediction
    for it and that prediction maps to the class name the array is filed
    under; anything else (a wrong class, or more than one prediction for
    the array) counts as a failure.

    Args:
        model: Trained model exposing ``predict(array_2d)``.
        label_map: Maps the model's integer labels back to class names.
        val_data: Validation data mapping each true class name to a list
            of feature arrays.

    Returns:
        A tuple ``(accuracy, failed)``. ``accuracy`` is the number of
        successes divided by the number of arrays evaluated, or ``0.0``
        when there was nothing to evaluate. ``failed`` receives one entry
        per failing array.

    Raises:
        KeyError: If the model predicts a label missing from ``label_map``.
    """
    failed = []
    correct = 0
    total = 0

    for class_name, arrays_list in val_data.items():
        for arr in arrays_list:
            total += 1
            sample = np.array(arr)
            # The model expects 2-D input; promote a lone feature vector
            # to a single-row matrix.
            if sample.ndim == 1:
                sample = sample.reshape(1, -1)

            pred_classes = [label_map[label] for label in model.predict(sample)]

            if len(pred_classes) == 1 and pred_classes[0] == class_name:
                correct += 1
            else:
                # NOTE(review): this records the class's entire list of
                # arrays for each failure rather than the failing array
                # itself. Kept as-is so the shape of the return value is
                # unchanged for existing callers; confirm whether ``arr``
                # was intended.
                failed.append(arrays_list)

    # Guard against division by zero when no arrays were supplied.
    if total == 0:
        return 0.0, failed
    return correct / total, failed
|
|
|
|
if __name__ == "__main__":
    # Running this file as a script is deliberately disabled: the process
    # exits here before any training happens. ``raise SystemExit`` replaces
    # ``exit()`` because ``exit`` is a helper injected by the ``site``
    # module and is not guaranteed to exist (e.g. under ``python -S``).
    raise SystemExit

    # --- Unreachable while the early exit above is in place ---
    # TODO(review): ``d`` is not defined anywhere in the visible file; the
    # training data must be loaded into it before this can be re-enabled.
    model, label_map = train_model(d)
    print("训练完成")
    joblib.dump(model, "model.pkl")

    # NOTE(review): evaluation on validation data is not wired up here.
    # ``predict`` returns an ``(accuracy, failed)`` tuple, so any
    # evaluation code added here must unpack that tuple rather than
    # iterate the result like a dict.