# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For data preprocess
import numpy as np
import csv
import os

# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
myseed = 42069  # fixed random seed so that runs are reproducible

# Ask cuDNN for deterministic kernels and turn off its benchmark mode, which
# may otherwise choose different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Seed every random number generator the script relies on.
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)
导入工具
无需修改
def get_device():
    ''' Get device (if GPU is available, use GPU)

    Returns:
        str: 'cuda' when torch reports an available CUDA device, else 'cpu'.
    '''
    return 'cuda' if torch.cuda.is_available() else 'cpu'
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For data preprocess
import numpy as np
import csv
import os
class COVID19Dataset(Dataset):
    ''' Dataset for loading and preprocessing the COVID19 dataset

    The CSV at `path` is expected to have one header row and a leading id
    column; both are dropped. In 'train' and 'dev' mode the last column is the
    regression target. Every column from index 40 onward (i.e. everything
    after the 40 state columns) is standardized with the mean and standard
    deviation of the *training* split, so that dev and test data are scaled
    with training statistics instead of their own.

    Args:
        path: path of the CSV file to read.
        mu: per-feature mean used for normalization. Ignored in 'train' mode
            (computed from the training split); required otherwise.
        std: per-feature standard deviation, same rules as `mu`.
        mode: one of 'train', 'dev' or 'test'.
        target_only: if True, keep only the 40 state columns plus the columns
            listed in the module-level `feats_selected`; otherwise keep the
            first 93 columns.

    Attributes:
        data: FloatTensor of (normalized) features.
        target: FloatTensor of targets ('train' / 'dev' only).
        mu, std: the statistics that were used for normalization.
        dim: number of feature columns.

    Raises:
        ValueError: if `mode` is unknown, or if `mu` / `std` is missing in a
            mode other than 'train'.
    '''

    def __init__(self, path, mu, std, mode='train', target_only=False):
        if mode not in ('train', 'dev', 'test'):
            raise ValueError(
                "mode must be 'train', 'dev' or 'test', got {!r}".format(mode))
        if mode != 'train' and (mu is None or std is None):
            raise ValueError(
                "mu and std from the training set are required when mode is "
                "{!r}".format(mode))
        self.mode = mode

        # Read data into numpy arrays, skipping the header row and id column.
        with open(path, 'r') as fp:
            data = list(csv.reader(fp))
        data = np.array(data[1:])[:, 1:].astype(float)

        if not target_only:
            feats = list(range(93))
        else:
            # 40 state columns plus the features chosen during feature
            # selection. `feats_selected` is defined elsewhere in the file;
            # to use only the two tested_positive columns instead, use
            # feats = list(range(40)) + [57, 75]
            feats = list(range(40)) + feats_selected

        if mode == 'test':
            # Testing data
            # data: 893 x 93 (40 states + day 1 (18) + day 2 (18) + day 3 (17))
            data = data[:, feats]
            self.data = torch.FloatTensor(data)
        else:
            # Training data (train/dev sets)
            # data: 2700 x 94 (40 states + day 1 (18) + day 2 (18) + day 3 (18))
            target = data[:, -1]
            data = data[:, feats]

            # The baseline picked every 10th row for dev, which can skew the
            # data distribution; split at random (with a fixed seed) instead.
            # `train_test_split` is expected to be imported elsewhere in the
            # file (presumably sklearn.model_selection) - TODO confirm.
            indices_tr, indices_dev = train_test_split(
                list(range(data.shape[0])), test_size=0.3, random_state=0)
            indices = indices_tr if mode == 'train' else indices_dev

            # Convert data into PyTorch tensors
            self.data = torch.FloatTensor(data[indices])
            self.target = torch.FloatTensor(target[indices])

        # Normalize features. The baseline normalized every split with its own
        # statistics; dev and test data should instead reuse the mean and std
        # of the training split, which the caller passes back in.
        if mode == 'train':
            self.mu = self.data[:, 40:].mean(dim=0, keepdim=True)
            self.std = self.data[:, 40:].std(dim=0, keepdim=True)
        else:
            self.mu = mu
            self.std = std
        self.data[:, 40:] = (self.data[:, 40:] - self.mu) / self.std

        self.dim = self.data.shape[1]
        print('Finished reading the {} set of COVID19 Dataset ({} samples found, each dim = {})'
              .format(mode, len(self.data), self.dim))

    def __getitem__(self, index):
        # Returns one sample at a time
        if self.mode in ['train', 'dev']:
            # For training
            return self.data[index], self.target[index]
        else:
            # For testing (no target)
            return self.data[index]

    def __len__(self):
        # Returns the size of the dataset
        return len(self.data)
def prep_dataloader(path, mode, batch_size, n_jobs=0, target_only=False, mu=None, std=None):
    ''' Generates a dataset, then is put into a dataloader.

    Args:
        path: CSV file handed to COVID19Dataset.
        mode: 'train', 'dev' or 'test'; batches are shuffled only for 'train'.
        batch_size: mini-batch size of the dataloader.
        n_jobs: number of dataloader worker processes.
        target_only: forwarded to COVID19Dataset (feature selection switch).
        mu, std: normalization statistics of the training set. They are not
            needed for the training set itself, hence the None defaults.

    Returns:
        (dataloader, mu, std). For mode == 'train', mu and std are the
        statistics stored on the freshly built dataset, so the caller can pass
        them on when building the dev and test loaders; otherwise the values
        passed in are returned unchanged.
    '''
    # Construct dataset
    dataset = COVID19Dataset(path, mu, std, mode=mode, target_only=target_only)

    # For the training set, keep its mean and std for the other splits.
    if mode == 'train':
        mu = dataset.mu
        std = dataset.std

    # Construct dataloader
    dataloader = DataLoader(
        dataset, batch_size,
        shuffle=(mode == 'train'), drop_last=False,
        num_workers=n_jobs, pin_memory=True)
    return dataloader, mu, std
class NeuralNet(nn.Module):
    ''' A simple fully-connected deep neural network

    Args:
        input_dim: number of input features.
        l2_alpha: weight of an optional L2 penalty on the layers' weight
            matrices that is added to the loss. Defaults to 0.0 (disabled);
            the author reports that L2 regularization did not help here.
    '''

    def __init__(self, input_dim, l2_alpha=0.0):
        super(NeuralNet, self).__init__()

        # Define your neural network here
        # Author's tuning note: a hidden width of about 70 worked best (68 is
        # what is used), and adding more layers overfits easily.
        self.net = nn.Sequential(
            nn.Linear(input_dim, 68),
            nn.ReLU(),
            nn.Linear(68, 1)
        )

        # Mean squared error loss
        self.criterion = nn.MSELoss(reduction='mean')
        self.l2_alpha = l2_alpha

    def forward(self, x):
        ''' Given input of size (batch_size x input_dim), compute output of the network '''
        return self.net(x).squeeze(1)

    def cal_loss(self, pred, target):
        ''' Calculate loss

        Returns the root mean squared error between `pred` and `target`
        (the author notes RMSE matches the Kaggle metric and trains more
        smoothly), plus the L2 penalty when `l2_alpha` is non-zero.
        '''
        # eps keeps the argument of sqrt strictly positive, so the gradient
        # stays finite even when the MSE is exactly zero.
        eps = 1e-6
        loss = torch.sqrt(self.criterion(pred, target) + eps)
        if self.l2_alpha:
            l2_reg = sum(torch.norm(w, p=2)
                         for name, w in self.net.named_parameters()
                         if 'weight' in name)
            loss = loss + self.l2_alpha * l2_reg
        return loss
def train(tr_set, dv_set, model, config, device):
    ''' DNN training

    Args:
        tr_set: iterable of (x, y) training batches.
        dv_set: validation dataloader handed to dev().
        model: network exposing cal_loss(pred, target).
        config: dict with the keys 'n_epochs', 'optimizer' (name of a class in
            torch.optim), 'optim_hparas' (keyword arguments for that class),
            'early_stop' and 'save_path'.
        device: device the batches are moved to.

    Returns:
        (min_mse, loss_record): the best validation loss seen, and a dict with
        the per-step training losses ('train') and per-epoch validation losses
        ('dev'). The values are whatever model.cal_loss returns.
    '''
    n_epochs = config['n_epochs']  # Maximum number of epochs

    # Setup optimizer from the config, e.g. torch.optim.SGD(params, lr=..., momentum=...)
    optimizer = getattr(torch.optim, config['optimizer'])(
        model.parameters(), **config['optim_hparas'])

    min_mse = 1000.
    loss_record = {'train': [], 'dev': []}  # for recording training loss
    early_stop_cnt = 0
    epoch = 0
    while epoch < n_epochs:
        model.train()                           # set model to training mode
        for x, y in tr_set:                     # iterate through the dataloader
            optimizer.zero_grad()               # set gradient to zero
            x, y = x.to(device), y.to(device)   # move data to device (cpu/cuda)
            pred = model(x)                     # forward pass (compute output)
            mse_loss = model.cal_loss(pred, y)  # compute loss
            mse_loss.backward()                 # compute gradient (backpropagation)
            optimizer.step()                    # update model with optimizer
            loss_record['train'].append(mse_loss.detach().cpu().item())

        # After each epoch, test your model on the validation (development) set.
        dev_mse = dev(dv_set, model, device)
        if dev_mse < min_mse:
            # Save model if your model improved
            min_mse = dev_mse
            print('Saving model (epoch = {:4d}, loss = {:.4f})'
                  .format(epoch + 1, min_mse))
            torch.save(model.state_dict(), config['save_path'])  # Save model to specified path
            early_stop_cnt = 0
        else:
            early_stop_cnt += 1

        epoch += 1
        loss_record['dev'].append(dev_mse)
        if early_stop_cnt > config['early_stop']:
            # Stop training if your model stops improving for "config['early_stop']" epochs.
            break

    print('Finished training after {} epochs'.format(epoch))
    return min_mse, loss_record
def dev(dv_set, model, device):
    ''' Evaluate the model on the validation set.

    Args:
        dv_set: dataloader yielding (x, y) batches; len(dv_set.dataset) is
            used as the number of samples.
        model: network exposing cal_loss(pred, target).
        device: device the batches are moved to.

    Returns:
        float: the per-batch losses averaged over all samples (each batch loss
        is weighted by its batch size).
    '''
    model.eval()                                # set model to evaluation mode
    total_loss = 0
    with torch.no_grad():                       # disable gradient calculation
        for x, y in dv_set:                     # iterate through the dataloader
            x, y = x.to(device), y.to(device)   # move data to device (cpu/cuda)
            pred = model(x)                     # forward pass (compute output)
            mse_loss = model.cal_loss(pred, y)  # compute loss
            total_loss += mse_loss.detach().cpu().item() * len(x)  # accumulate loss
    total_loss = total_loss / len(dv_set.dataset)  # compute averaged loss

    return total_loss
def test(tt_set, model, device):
    ''' Run the model on the test set and collect its predictions.

    Args:
        tt_set: dataloader yielding feature batches (no targets).
        model: network to evaluate.
        device: device the batches are moved to.

    Returns:
        numpy.ndarray: all predictions concatenated along the first axis; an
        empty array when `tt_set` yields no batches.
    '''
    model.eval()                            # set model to evaluation mode
    preds = []
    with torch.no_grad():                   # disable gradient calculation
        for x in tt_set:                    # iterate through the dataloader
            x = x.to(device)                # move data to device (cpu/cuda)
            pred = model(x)                 # forward pass (compute output)
            preds.append(pred.detach().cpu())  # collect prediction
    if not preds:
        # torch.cat raises on an empty list; an empty loader means no predictions.
        return torch.empty(0).numpy()
    # concatenate all predictions and convert to a numpy array
    return torch.cat(preds, dim=0).numpy()


# The name starts with "test", which pytest would otherwise try to collect as
# a test case; mark it explicitly as a plain helper.
test.__test__ = False
device = get_device()                 # get the current available device ('cpu' or 'cuda')
os.makedirs('models', exist_ok=True)  # The trained model will be saved to ./models/
target_only = True                    # TODO: Using 40 states & 2 tested_positive features

# TODO: How to tune these hyper-parameters to improve your model's performance?
config = {
    'n_epochs': 3000,                # maximum number of epochs
    'batch_size': 270,               # mini-batch size for dataloader
    'optimizer': 'SGD',              # optimization algorithm (optimizer in torch.optim)
    'optim_hparas': {                # hyper-parameters for the optimizer (depends on which optimizer you are using)
        'lr': 0.005,                 # learning rate of SGD
        'momentum': 0.5              # momentum for SGD
    },
    'early_stop': 200,               # early stopping epochs (the number epochs since your model's last improvement)
    'save_path': 'models/model_select.path'  # your model will be saved here
}
Finished reading the train set of COVID19 Dataset (1890 samples found, each dim = 54)
Finished reading the dev set of COVID19 Dataset (810 samples found, each dim = 54)
Finished reading the test set of COVID19 Dataset (893 samples found, each dim = 54)
# Construct the model with an input width equal to the feature count stored on
# the training dataset (tr_set.dataset.dim), then move it to the chosen device.
# NOTE(review): tr_set is not created in the visible part of the file; it is
# presumably the training dataloader returned by prep_dataloader - confirm.
model = NeuralNet(tr_set.dataset.dim).to(device) # Construct model and move to device
Saving model (epoch = 1, loss = 17.9400)
Saving model (epoch = 2, loss = 17.7633)
Saving model (epoch = 3, loss = 17.5787)
Saving model (epoch = 4, loss = 17.3771)
……
Saving model (epoch = 581, loss = 0.9606)
Saving model (epoch = 594, loss = 0.9606)
Saving model (epoch = 598, loss = 0.9606)
Saving model (epoch = 599, loss = 0.9604)
Saving model (epoch = 600, loss = 0.9603)
Saving model (epoch = 621, loss = 0.9602)
Saving model (epoch = 706, loss = 0.9601)
Saving model (epoch = 741, loss = 0.9601)
Saving model (epoch = 781, loss = 0.9598)
Saving model (epoch = 786, loss = 0.9597)
Finished training after 987 epochs
# Discard the model in its last-epoch state and rebuild it from the best
# checkpoint that training wrote to config['save_path'].
del model
model = NeuralNet(tr_set.dataset.dim).to(device)
ckpt = torch.load(config['save_path'], map_location='cpu')  # Load your best model
model.load_state_dict(ckpt)
plot_pred(dv_set, model, device)  # Show prediction on the validation set
def save_pred(preds, file):
    ''' Save predictions to specified file

    Writes a CSV with the header row "id,tested_positive" followed by one row
    per prediction, where the id is the prediction's position in `preds`.

    Args:
        preds: iterable of predicted values.
        file: path of the CSV file to write (overwritten if it exists).
    '''
    print('Saving results to {}'.format(file))
    # newline='' lets the csv module control line endings itself; without it,
    # extra blank rows can appear on platforms that translate '\n'.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        writer.writerows(enumerate(preds))


preds = test(tt_set, model, device)  # predict COVID-19 cases with your model
save_pred(preds, 'commit.csv')       # save predictions to commit.csv