# Try the pip-installed malmo package first; fall back to the MalmoPython module
# shipped with the Malmo release (on PYTHONPATH).
try:
    from malmo import MalmoPython
except ImportError:
    import MalmoPython

import os
import sys
import time
import json
import random
from tqdm import tqdm
from collections import deque
import matplotlib.pyplot as plt
import numpy as np
from numpy.random import randint

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


# Hyperparameters
SIZE = 50
REWARD_DENSITY = .1
PENALTY_DENSITY = .02
OBS_SIZE = 5
MAX_EPISODE_STEPS = 100
MAX_GLOBAL_STEPS = 10000
REPLAY_BUFFER_SIZE = 10000
EPSILON_DECAY = .999
MIN_EPSILON = .1
BATCH_SIZE = 128
GAMMA = .9
TARGET_UPDATE = 100
LEARNING_RATE = 1e-4
START_TRAINING = 500
LEARN_FREQUENCY = 1
ACTION_DICT = {
    0: 'move 1',    # Move one block forward
    1: 'turn 1',    # Turn 90 degrees to the right
    2: 'turn -1',   # Turn 90 degrees to the left
    3: 'attack 1'   # Destroy block
}


# Q-Value Network
class QNetwork(nn.Module):
    #------------------------------------
    #
    #   TODO: Modify network architecture
    #
    #-------------------------------------

    def __init__(self, obs_size, action_size, hidden_size=100):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(np.prod(obs_size), hidden_size),
                                 nn.ReLU(),
                                 nn.Linear(hidden_size, action_size))

    def forward(self, obs):
        """
        Estimate q-values given obs

        Args:
            obs (tensor): current obs, size (batch x obs_size)

        Returns:
            q-values (tensor): estimated q-values, size (batch x action_size)
        """
        batch_size = obs.shape[0]
        obs_flat = obs.view(batch_size, -1)
        return self.net(obs_flat)


def GetMissionXML():
    #------------------------------------
    #
    #   TODO: Spawn diamonds
    #   TODO: Spawn lava
    #   TODO: Add diamond reward
    #   TODO: Add lava negative reward
    #
    #-------------------------------------

    return '''
        Diamond Collector
        clear
        ''' + \
        "".format(-SIZE, SIZE, -SIZE, SIZE) + \
        "".format(-SIZE, SIZE, -SIZE, SIZE) + \
        '''
        CS175DiamondCollector
        '''
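
# GetMissionXML() must return a complete Malmo <Mission> XML document for MissionSpec to
# parse. The fragments below are an illustrative sketch (not the original mission file) of
# elements that could address the spawn/reward TODOs above: <DrawBlock> entries belong in
# the mission's <DrawingDecorator>, and the reward handlers belong in <AgentHandlers>.
# The helper name get_diamond_lava_xml, the floor_y level, and the reward magnitudes are
# assumptions; attribute names and values should be checked against the Malmo mission schema.
def get_diamond_lava_xml(floor_y=2):
    """Return <DrawBlock> elements scattering diamond_ore and lava over the arena floor."""
    xml = ""
    for x in range(-SIZE, SIZE):
        for z in range(-SIZE, SIZE):
            roll = random.random()
            if roll < REWARD_DENSITY:
                xml += "<DrawBlock x='{}' y='{}' z='{}' type='diamond_ore'/>".format(x, floor_y, z)
            elif roll < REWARD_DENSITY + PENALTY_DENSITY:
                xml += "<DrawBlock x='{}' y='{}' z='{}' type='lava'/>".format(x, floor_y, z)
    return xml


# Reward handlers sketch: +1 for collecting a diamond, -1 for touching lava.
REWARD_HANDLERS_XML = '''
    <RewardForCollectingItem>
        <Item type="diamond" reward="1"/>
    </RewardForCollectingItem>
    <RewardForTouchingBlockType>
        <Block type="lava" reward="-1" behaviour="onceOnly"/>
    </RewardForTouchingBlockType>
'''
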
def get_action(obs, q_network, epsilon, allow_break_action):
    """
    Select action according to e-greedy policy

    Args:
        obs (np-array): current observation, size (obs_size)
        q_network (QNetwork): Q-Network
        epsilon (float): probability of choosing a random action
        allow_break_action (bool): whether the agent is currently facing a diamond block

    Returns:
        action (int): chosen action [0, action_size)
    """
    #------------------------------------
    #
    #   TODO: Implement e-greedy policy
    #
    #-------------------------------------

    # Prevent computation graph from being calculated
    with torch.no_grad():
        # Calculate Q-values for each action
        obs_torch = torch.tensor(obs.copy(), dtype=torch.float).unsqueeze(0)
        action_values = q_network(obs_torch)

        # Remove attack/mine from possible actions if not facing a diamond
        if not allow_break_action:
            action_values[0, 3] = -float('inf')

        # Select action with highest Q-value
        action_idx = torch.argmax(action_values).item()

    return action_idx
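
# One way the e-greedy TODO in get_action() could be completed (a sketch, not the only
# valid answer; the function name below is illustrative): with probability epsilon pick a
# uniformly random valid action, otherwise act greedily on the Q-values. The random branch
# also respects allow_break_action by excluding the attack action.
def get_action_epsilon_greedy(obs, q_network, epsilon, allow_break_action):
    """Epsilon-greedy variant of get_action(); could be used as a drop-in replacement."""
    valid_actions = list(ACTION_DICT.keys())
    if not allow_break_action:
        valid_actions.remove(3)  # attack is only useful when facing a diamond

    # Explore: random valid action with probability epsilon
    if random.random() < epsilon:
        return random.choice(valid_actions)

    # Exploit: greedy action from the Q-network (no gradients needed for action selection)
    with torch.no_grad():
        obs_torch = torch.tensor(obs.copy(), dtype=torch.float).unsqueeze(0)
        action_values = q_network(obs_torch)
        if not allow_break_action:
            action_values[0, 3] = -float('inf')
        return torch.argmax(action_values).item()
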
def init_malmo(agent_host):
    """ Initialize new malmo mission. """
    my_mission = MalmoPython.MissionSpec(GetMissionXML(), True)
    my_mission_record = MalmoPython.MissionRecordSpec()
    my_mission.requestVideo(800, 500)
    my_mission.setViewpoint(1)

    max_retries = 3
    my_clients = MalmoPython.ClientPool()
    my_clients.add(MalmoPython.ClientInfo('127.0.0.1', 10000))  # add Minecraft machines here as available

    for retry in range(max_retries):
        try:
            agent_host.startMission(my_mission, my_clients, my_mission_record, 0, "DiamondCollector")
            break
        except RuntimeError as e:
            if retry == max_retries - 1:
                print("Error starting mission:", e)
                exit(1)
            else:
                time.sleep(2)

    return agent_host


def get_observation(world_state):
    """
    Use the agent observation API to get a 2 x 5 x 5 grid around the agent.
    The agent is in the center square facing up.

    Args:
        world_state: current agent world state

    Returns:
        observation (np-array): 2 x 5 x 5 grid, 1 where the block is diamond_ore or lava, 0 otherwise
    """
    obs = np.zeros((2, OBS_SIZE, OBS_SIZE))

    while world_state.is_mission_running:
        time.sleep(0.1)
        world_state = agent_host.getWorldState()  # uses the module-level agent_host created in __main__
        if len(world_state.errors) > 0:
            raise AssertionError('Could not load grid.')

        if world_state.number_of_observations_since_last_state > 0:
            # First we get the json from the observation API
            msg = world_state.observations[-1].text
            observations = json.loads(msg)

            # Get observation
            grid = observations['floorAll']
            grid_binary = [1 if x == 'diamond_ore' or x == 'lava' else 0 for x in grid]
            obs = np.reshape(grid_binary, (2, OBS_SIZE, OBS_SIZE))

            # Rotate observation with orientation of agent
            yaw = observations['Yaw']
            if yaw == 270:
                obs = np.rot90(obs, k=1, axes=(1, 2))
            elif yaw == 0:
                obs = np.rot90(obs, k=2, axes=(1, 2))
            elif yaw == 90:
                obs = np.rot90(obs, k=3, axes=(1, 2))

            break

    return obs


def prepare_batch(replay_buffer):
    """
    Randomly sample batch from replay buffer and prepare tensors

    Args:
        replay_buffer (list): obs, action, next_obs, reward, done tuples

    Returns:
        obs (tensor): float tensor of size (BATCH_SIZE x obs_size)
        action (tensor): long tensor of size (BATCH_SIZE)
        next_obs (tensor): float tensor of size (BATCH_SIZE x obs_size)
        reward (tensor): float tensor of size (BATCH_SIZE)
        done (tensor): float tensor of size (BATCH_SIZE)
    """
    batch_data = random.sample(replay_buffer, BATCH_SIZE)
    obs = torch.tensor([x[0] for x in batch_data], dtype=torch.float)
    action = torch.tensor([x[1] for x in batch_data], dtype=torch.long)
    next_obs = torch.tensor([x[2] for x in batch_data], dtype=torch.float)
    reward = torch.tensor([x[3] for x in batch_data], dtype=torch.float)
    done = torch.tensor([x[4] for x in batch_data], dtype=torch.float)

    return obs, action, next_obs, reward, done


def learn(batch, optim, q_network, target_network):
    """
    Update Q-Network according to DQN Loss function

    Args:
        batch (tuple): tuple of obs, action, next_obs, reward, and done tensors
        optim (Adam): Q-Network optimizer
        q_network (QNetwork): Q-Network
        target_network (QNetwork): Target Q-Network
    """
    obs, action, next_obs, reward, done = batch

    optim.zero_grad()
    values = q_network(obs).gather(1, action.unsqueeze(-1)).squeeze(-1)
    # TD target: r + gamma * max_a' Q_target(s', a'), zeroed at terminal states
    target = torch.max(target_network(next_obs), 1)[0]
    target = reward + GAMMA * target * (1 - done)
    loss = torch.mean((target - values) ** 2)
    loss.backward()
    optim.step()

    return loss.item()


def log_returns(steps, returns):
    """
    Log the current returns as a graph and text file

    Args:
        steps (list): list of global steps after each episode
        returns (list): list of total return of each episode
    """
    box = np.ones(10) / 10
    returns_smooth = np.convolve(returns, box, mode='same')
    plt.clf()
    plt.plot(steps, returns_smooth)
    plt.title('Diamond Collector')
    plt.ylabel('Return')
    plt.xlabel('Steps')
    plt.savefig('returns.png')

    with open('returns.txt', 'w') as f:
        for value in returns:
            f.write("{}\n".format(value))


def train(agent_host):
    """
    Main loop for the DQN learning algorithm

    Args:
        agent_host (MalmoPython.AgentHost)
    """
    # Init networks
    q_network = QNetwork((2, OBS_SIZE, OBS_SIZE), len(ACTION_DICT))
    target_network = QNetwork((2, OBS_SIZE, OBS_SIZE), len(ACTION_DICT))
    target_network.load_state_dict(q_network.state_dict())

    # Init optimizer
    optim = torch.optim.Adam(q_network.parameters(), lr=LEARNING_RATE)

    # Init replay buffer
    replay_buffer = deque(maxlen=REPLAY_BUFFER_SIZE)

    # Init vars
    global_step = 0
    num_episode = 0
    epsilon = 1
    start_time = time.time()
    returns = []
    steps = []

    # Begin main loop
    loop = tqdm(total=MAX_GLOBAL_STEPS, position=0, leave=False)
    while global_step < MAX_GLOBAL_STEPS:
        episode_step = 0
        episode_return = 0
        episode_loss = 0
        done = False

        # Setup Malmo
        agent_host = init_malmo(agent_host)

        world_state = agent_host.getWorldState()
        while not world_state.has_mission_begun:
            time.sleep(0.1)
            world_state = agent_host.getWorldState()
            for error in world_state.errors:
                print("\nError:", error.text)
        obs = get_observation(world_state)

        # Run episode
        while world_state.is_mission_running:
            # Get action
            allow_break_action = obs[1, int(OBS_SIZE/2)-1, int(OBS_SIZE/2)] == 1
            action_idx = get_action(obs, q_network, epsilon, allow_break_action)
            command = ACTION_DICT[action_idx]

            # Take step
            agent_host.sendCommand(command)

            # If your agent isn't registering reward you may need to increase this
            time.sleep(.1)

            # We have to manually calculate terminal state to give malmo time to register the end of the mission
            # If you see "commands connection is not open. Is the mission running?" you may need to increase this
            episode_step += 1
            if episode_step >= MAX_EPISODE_STEPS or \
                    (obs[0, int(OBS_SIZE/2)-1, int(OBS_SIZE/2)] == 1 and \
                     obs[1, int(OBS_SIZE/2)-1, int(OBS_SIZE/2)] == 0 and \
                     command == 'move 1'):
                done = True
                time.sleep(2)

            # Get next observation
            world_state = agent_host.getWorldState()
            for error in world_state.errors:
                print("Error:", error.text)
            next_obs = get_observation(world_state)

            # Get reward
            reward = 0
            for r in world_state.rewards:
                reward += r.getValue()
            episode_return += reward

            # Store step in replay buffer
            replay_buffer.append((obs, action_idx, next_obs, reward, done))
            obs = next_obs

            # Learn
            global_step += 1
            if global_step > START_TRAINING and global_step % LEARN_FREQUENCY == 0:
                batch = prepare_batch(replay_buffer)
                loss = learn(batch, optim, q_network, target_network)
                episode_loss += loss

                if epsilon > MIN_EPSILON:
                    epsilon *= EPSILON_DECAY

                if global_step % TARGET_UPDATE == 0:
                    target_network.load_state_dict(q_network.state_dict())

        num_episode += 1
        returns.append(episode_return)
        steps.append(global_step)
        avg_return = sum(returns[-min(len(returns), 10):]) / min(len(returns), 10)
        loop.update(episode_step)
        loop.set_description('Episode: {} Steps: {} Time: {:.2f} Loss: {:.2f} Last Return: {:.2f} Avg Return: {:.2f}'.format(
            num_episode, global_step, (time.time() - start_time) / 60, episode_loss, episode_return, avg_return))

        if num_episode > 0 and num_episode % 10 == 0:
            log_returns(steps, returns)
            print()


if __name__ == '__main__':
    # Create default Malmo objects:
    agent_host = MalmoPython.AgentHost()
    try:
        agent_host.parse(sys.argv)
    except RuntimeError as e:
        print('ERROR:', e)
        print(agent_host.getUsage())
        exit(1)
    if agent_host.receivedArgument("help"):
        print(agent_host.getUsage())
        exit(0)

    train(agent_host)
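
# Optional follow-up (an assumption, not part of the starter code): train() does not
# currently return or save its networks. If it were modified to return q_network, the
# learned weights could be checkpointed and reloaded like this (the file name
# 'diamond_collector.pth' is arbitrary):
#
#   q_network = train(agent_host)
#   torch.save(q_network.state_dict(), 'diamond_collector.pth')
#
#   q_network = QNetwork((2, OBS_SIZE, OBS_SIZE), len(ACTION_DICT))
#   q_network.load_state_dict(torch.load('diamond_collector.pth'))
#   q_network.eval()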