# Malmo can be imported either from the 'malmo' pip package or as a standalone
# MalmoPython module; try the package layout first and fall back.
try:
    from malmo import MalmoPython
except ImportError:
    import MalmoPython
import os
import sys
import time
import json
import random
from tqdm import tqdm
from collections import deque
import matplotlib.pyplot as plt
import numpy as np
from numpy.random import randint
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# Hyperparameters
SIZE = 50
REWARD_DENSITY = .1
PENALTY_DENSITY = .02
OBS_SIZE = 5
MAX_EPISODE_STEPS = 100
MAX_GLOBAL_STEPS = 10000
REPLAY_BUFFER_SIZE = 10000
EPSILON_DECAY = .999
MIN_EPSILON = .1
BATCH_SIZE = 128
GAMMA = .9
TARGET_UPDATE = 100
LEARNING_RATE = 1e-4
START_TRAINING = 500
LEARN_FREQUENCY = 1
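# Illustrative usage of the exploration constants above (a sketch, not part of
# the provided skeleton): epsilon is typically decayed once per global step and
# clipped at MIN_EPSILON,
#
#   epsilon = max(MIN_EPSILON, epsilon * EPSILON_DECAY)
#
# so starting from 1.0 it reaches MIN_EPSILON after roughly 2,300 steps.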
ACTION_DICT = {
0: 'move 1', # Move one block forward
1: 'turn 1', # Turn 90 degrees to the right
2: 'turn -1', # Turn 90 degrees to the left
3: 'attack 1' # Destroy block
}
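# Usage sketch: during an episode the chosen action index is mapped through
# ACTION_DICT and sent to Malmo as a discrete command, e.g.
#
#   agent_host.sendCommand(ACTION_DICT[action_idx])
#
# (AgentHost.sendCommand is the standard Malmo call for issuing commands.)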
# Q-Value Network
class QNetwork(nn.Module):
#------------------------------------
#
# TODO: Modify network architecture
#
#-------------------------------------
def __init__(self, obs_size, action_size, hidden_size=100):
super().__init__()
self.net = nn.Sequential(nn.Linear(np.prod(obs_size), hidden_size),
nn.ReLU(),
nn.Linear(hidden_size, action_size))
def forward(self, obs):
"""
Estimate q-values given obs
Args:
obs (tensor): current obs, size (batch x obs_size)
Returns:
q-values (tensor): estimated q-values, size (batch x action_size)
"""
batch_size = obs.shape[0]
obs_flat = obs.view(batch_size, -1)
return self.net(obs_flat)
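# Quick shape check for QNetwork (illustrative; the (2, OBS_SIZE, OBS_SIZE)
# observation shape is an assumption based on the 2 x 5 x 5 grid described in
# get_observation below):
#
#   q_net = QNetwork((2, OBS_SIZE, OBS_SIZE), len(ACTION_DICT))
#   dummy = torch.zeros(1, 2, OBS_SIZE, OBS_SIZE)
#   q_net(dummy).shape  # -> torch.Size([1, 4])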
def GetMissionXML():
#------------------------------------
#
# TODO: Spawn diamonds
# TODO: Spawn lava
# TODO: Add diamond reward
# TODO: Add lava negative reward
#
#-------------------------------------
    # Minimal skeleton of the mission XML. The generator string, block types,
    # agent placement, and the observation grid below are placeholders; the
    # diamond and lava drawing plus their reward handlers (see TODOs above)
    # still need to be added.
    return '''<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
        <Mission xmlns="http://ProjectMalmo.microsoft.com" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
            <About><Summary>Diamond Collector</Summary></About>
            <ServerSection>
                <ServerInitialConditions><Weather>clear</Weather></ServerInitialConditions>
                <ServerHandlers>
                    <FlatWorldGenerator generatorString="3;7,2;1;"/>
                    <DrawingDecorator>''' + \
        "<DrawCuboid x1='{}' x2='{}' y1='1' y2='1' z1='{}' z2='{}' type='stone'/>".format(-SIZE, SIZE, -SIZE, SIZE) + \
        "<DrawCuboid x1='{}' x2='{}' y1='2' y2='2' z1='{}' z2='{}' type='air'/>".format(-SIZE, SIZE, -SIZE, SIZE) + \
        '''</DrawingDecorator>
                    <ServerQuitWhenAnyAgentFinishes/>
                </ServerHandlers>
            </ServerSection>
            <AgentSection mode="Survival">
                <Name>CS175DiamondCollector</Name>
                <AgentStart><Placement x="0.5" y="2" z="0.5" pitch="45" yaw="0"/></AgentStart>
                <AgentHandlers>
                    <DiscreteMovementCommands/>
                    <ObservationFromGrid>
                        <Grid name="floorAll"><min x="-2" y="-1" z="-2"/><max x="2" y="0" z="2"/></Grid>
                    </ObservationFromGrid>
                </AgentHandlers>
            </AgentSection>
        </Mission>'''
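# One possible way to fill in the TODOs above (a sketch, not the required
# solution): draw diamonds and lava in the DrawingDecorator and declare the
# matching rewards in the AgentHandlers, e.g.
#
#   <DrawBlock x="3"  y="2" z="4" type="diamond_ore"/>
#   <DrawBlock x="-2" y="2" z="1" type="lava"/>
#   ...
#   <RewardForCollectingItem>
#       <Item type="diamond" reward="1"/>
#   </RewardForCollectingItem>
#   <RewardForTouchingBlockType>
#       <Block type="lava" reward="-1" behaviour="onceOnly"/>
#   </RewardForTouchingBlockType>
#
# The block coordinates here are arbitrary examples; REWARD_DENSITY and
# PENALTY_DENSITY suggest sampling positions randomly over the arena.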
def get_action(obs, q_network, epsilon, allow_break_action):
"""
Select action according to e-greedy policy
Args:
obs (np-array): current observation, size (obs_size)
q_network (QNetwork): Q-Network
        epsilon (float): probability of choosing a random action
        allow_break_action (bool): whether the 'attack 1' action is currently allowed
Returns:
action (int): chosen action [0, action_size)
"""
#------------------------------------
#
# TODO: Implement e-greedy policy
#
#-------------------------------------
# Prevent computation graph from being calculated
with torch.no_grad():
        # Calculate Q-values for each action
obs_torch = torch.tensor(obs.copy(), dtype=torch.float).unsqueeze(0)
action_values = q_network(obs_torch)
# Remove attack/mine from possible actions if not facing a diamond
if not allow_break_action:
action_values[0, 3] = -float('inf')
# Select action with highest Q-value
action_idx = torch.argmax(action_values).item()
return action_idx
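# Sketch of the random branch for the e-greedy TODO above (one possible
# approach, not the provided solution): with probability epsilon, return a
# uniformly random valid action before the greedy computation, e.g.
#
#   if random.random() < epsilon:
#       valid_actions = list(ACTION_DICT.keys())
#       if not allow_break_action:
#           valid_actions.remove(3)
#       return random.choice(valid_actions)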
def init_malmo(agent_host):
"""
Initialize new malmo mission.
"""
my_mission = MalmoPython.MissionSpec(GetMissionXML(), True)
my_mission_record = MalmoPython.MissionRecordSpec()
my_mission.requestVideo(800, 500)
my_mission.setViewpoint(1)
max_retries = 3
my_clients = MalmoPython.ClientPool()
my_clients.add(MalmoPython.ClientInfo('127.0.0.1', 10000)) # add Minecraft machines here as available
for retry in range(max_retries):
try:
agent_host.startMission( my_mission, my_clients, my_mission_record, 0, "DiamondCollector" )
break
except RuntimeError as e:
if retry == max_retries - 1:
print("Error starting mission:", e)
exit(1)
else:
time.sleep(2)
return agent_host
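# Typical usage (a sketch following the standard Malmo examples): after
# init_malmo returns, poll the world state until the mission has begun before
# entering the episode loop.
#
#   agent_host = init_malmo(agent_host)
#   world_state = agent_host.getWorldState()
#   while not world_state.has_mission_begun:
#       time.sleep(0.1)
#       world_state = agent_host.getWorldState()
#       for error in world_state.errors:
#           print("\nError:", error.text)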
def get_observation(world_state):
"""
Use the agent observation API to get a 2 x 5 x 5 grid around the agent.
The agent is in the center square facing up.
Args
world_state: