import random

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt


from IPython.display import display, Math, Latex


class Strategy:
    
    def __init__(self, n_arms: int):
        self.n_arms = n_arms
        self.n_iters = 0
        self.arms_states = np.zeros(n_arms)
        self.arms_actions = np.zeros(n_arms)
        
    def flush(self):
        self.n_iters = 0
        self.arms_states = np.zeros(self.n_arms)
        self.arms_actions = np.zeros(self.n_arms)
        
    def update_reward(self, arm: int, reward: int):
        self.n_iters += 1
        self.arms_states[arm] += reward
        self.arms_actions[arm] += 1
        
    def choose_arm(self):
        raise NotImplementedError


class EpsGreedy(Strategy):
    
    def __init__(self, n_arms: int, eps: float = 0.1):
        super().__init__(n_arms)
        self.eps = eps
        
    def choose_arm(self):
        if random.random() < self.eps:
            return random.randint(0, self.n_arms - 1) # choose random arm
        else:
            return np.argmax(self.arms_states / self.arms_actions) # choose the best arm


class UCB1(Strategy):
    
    def choose_arm(self):
        if self.n_iters < self.n_arms:
            return self.n_iters
        else:
            return np.argmax(self.ucb())
        
        
    def ucb(self):
        ucb = self.arms_states / self.arms_actions # mean x_j
        ucb += np.sqrt(2 * np.log(self.n_iters) / self.arms_actions) # confidence part
        return ucb


class Thompson(Strategy):
    
    def __init__(self, n_arms: int):
    # your code here
        
    def update_reward(self, arm: int, reward: int):
    # your code here 
        
    def choose_arm(self):
    # your code here
        return arm


class BernoulliEnv:
    
    def __init__(self, arms_proba: list):
        self.arms_proba = arms_proba
        
    @property
    def n_arms(self):
        return len(self.arms_proba)
        
    def pull_arm(self, arm_id: int):
        if random.random() < self.arms_proba[arm_id]:
            return 1
        else:
            return 0


class Bandit:
    
    def __init__(self, env: BernoulliEnv, strategy: Strategy):
        self.env = env
        self.strategy = strategy
        
    def action(self):
        arm = self.strategy.choose_arm()
        reward = self.env.pull_arm(arm)
        self.strategy.update_reward(arm, reward)


def calculate_regret(env: BernoulliEnv, strategy: Strategy, n_iters=2000):
    strategy.flush()
    bandit = Bandit(env, strategy)
    regrets = []
    for i in range(n_iters):
        reward = bandit.strategy.arms_actions.dot(env.arms_proba)
        optimal_reward = np.max(env.arms_proba) * i
        regret = optimal_reward - reward
        regrets.append(regret)
        bandit.action()
        
    return regrets


be = BernoulliEnv([0.3, 0.5, 0.7])
eps_1 = EpsGreedy(be.n_arms, 0.1)
eps_2 = EpsGreedy(be.n_arms, 0.3)
eps_3 = EpsGreedy(be.n_arms, 0.5)
ucb = UCB1(be.n_arms)


# Calculate regrets
eps_regrets = calculate_regret(be, eps_1)
eps_2_regrets = calculate_regret(be, eps_2)
eps_3_regrets = calculate_regret(be, eps_3)
ucb_regrets = calculate_regret(be, ucb)

/Users/iMatvich/anaconda3/envs/main/lib/python3.7/site-packages/ipykernel_launcher.py:11: RuntimeWarning: invalid value encountered in true_divide
  # This is added back by InteractiveShellApp.init_path()


plt.figure(figsize=(12, 8))
plt.plot(eps_regrets, label='eps_1')
plt.plot(eps_2_regrets, label='eps_2')
plt.plot(eps_3_regrets, label='eps_3')
plt.plot(ucb_regrets, label='ucb')
plt.legend()
plt.xlabel('number of iterations')
plt.ylabel('cumulative regret (less is better)')

Text(0, 0.5, 'cumulative regret (less is better)')


!pip install space_bandits

Collecting space_bandits
  Downloading https://files.pythonhosted.org/packages/38/25/1e2b2b5c6692dd334ab1c73cc19a4b160124efc6c54c7a90293e02228b72/space-bandits-0.0.992.tar.gz
Requirement already satisfied: torch in /Users/iMatvich/anaconda3/envs/main/lib/python3.7/site-packages (from space_bandits) (1.3.0)
Requirement already satisfied: numpy in /Users/iMatvich/anaconda3/envs/main/lib/python3.7/site-packages (from space_bandits) (1.16.4)
Requirement already satisfied: scipy in /Users/iMatvich/anaconda3/envs/main/lib/python3.7/site-packages (from space_bandits) (1.3.0)
Requirement already satisfied: pandas in /Users/iMatvich/anaconda3/envs/main/lib/python3.7/site-packages (from space_bandits) (0.24.2)
Requirement already satisfied: cython in /Users/iMatvich/anaconda3/envs/main/lib/python3.7/site-packages (from space_bandits) (0.29.12)
Requirement already satisfied: scikit-learn in /Users/iMatvich/anaconda3/envs/main/lib/python3.7/site-packages (from space_bandits) (0.21.2)
Requirement already satisfied: python-dateutil>=2.5.0 in /Users/iMatvich/anaconda3/envs/main/lib/python3.7/site-packages (from pandas->space_bandits) (2.8.0)
Requirement already satisfied: pytz>=2011k in /Users/iMatvich/anaconda3/envs/main/lib/python3.7/site-packages (from pandas->space_bandits) (2019.1)
Requirement already satisfied: joblib>=0.11 in /Users/iMatvich/anaconda3/envs/main/lib/python3.7/site-packages (from scikit-learn->space_bandits) (0.13.2)
Requirement already satisfied: six>=1.5 in /Users/iMatvich/anaconda3/envs/main/lib/python3.7/site-packages (from python-dateutil>=2.5.0->pandas->space_bandits) (1.12.0)
Building wheels for collected packages: space-bandits
  Building wheel for space-bandits (setup.py) ... done
  Stored in directory: /Users/iMatvich/Library/Caches/pip/wheels/a7/0c/a7/4bae68f8e665b4c68e7cb7ae7fff88269f3551a4b56b598a42
Successfully built space-bandits
Installing collected packages: space-bandits
Successfully installed space-bandits-0.0.992


from space_bandits.toy_problem import get_cust_reward, get_customer, get_rewards


group1 = [get_customer(ctype=1)[1] for x in range(1000)]
group2 = [get_customer(ctype=2)[1] for x in range(1000)]
plt.scatter([x[0] for x in group1], [x[1] for x in group1], label='group1');
plt.scatter([x[0] for x in group2], [x[1] for x in group2], label='group2');
plt.legend();


customers = [get_customer(ctype=1) for x in range(100000)]
rewards = np.concatenate([np.expand_dims(get_rewards(cust), axis=0) for cust in customers])
print('group 1 expected rewards: (100000 samples)', rewards.mean(axis=0))

customers = [get_customer(ctype=2) for x in range(100000)]
rewards = np.concatenate([np.expand_dims(get_rewards(cust), axis=0) for cust in customers])
rewards.mean(axis=0)
print('group 2 expected rewards: (100000 samples)', rewards.mean(axis=0))

group 1 expected rewards: (100000 samples) [8.9978  2.49625 1.958  ]
group 2 expected rewards: (100000 samples) [ 2.4983   2.47425 20.086  ]


customers[:10]

[(2, (53, 61.27884094803353)),
 (2, (38, 50.88285900495397)),
 (2, (40, 34.25962987297904)),
 (2, (41, 36.07741738964068)),
 (2, (46, 46.62970623580589)),
 (2, (36, 24.791053674847035)),
 (2, (41, 21.59056238451975)),
 (2, (45, 48.070796167156416)),
 (2, (46, 49.07765937426515)),
 (2, (50, 29.27442814968552))]


rewards

array([[ 10,   0, 100],
       [  0,  25,   0],
       [  0,  25,   0],
       ...,
       [ 10,   0,   0],
       [ 10,   0,   0],
       [  0,  25,   0]])


from space_bandits import LinearBandits
 
num_actions = 3 # num of available models (promo strategies)
num_features = 2 # num of features per customer

linear_model = LinearBandits(num_actions, num_features, initial_pulls=100)
optimal_choices = [None, 0, 2]

def iterate_model(model, optimal_choices, steps, records=None, plot_frequency=250, avg_length=150):
    """Goes through online learning simulation with model."""
    #these will track values for plotting
    if records is None:
        records = dict()
        records['timesteps'] = []
        records['c_reward'] = []
        records['cumulative_reward'] = 0
        records['m_reward'] = []
        records['maximum_reward'] = 0
        records['regret_record'] = []
        records['avg_regret'] = []
        start = 0
    else:
        start = records['timesteps'][-1]
    for i in range(start, start+steps):
        records['timesteps'].append(i)
        #generate a customer
        cust = get_customer()
        
        #generate customer decisions based on group
        reward_vec = get_rewards(cust)
        
        #prepare features for model
        context = np.array([cust[1]])
        best_choice = optimal_choices[cust[0]]
        
        #get reward for 'best' choice
        mx = reward_vec[best_choice]
        records['maximum_reward'] += mx
        records['m_reward'].append(records['maximum_reward'])
        
        # make and action
        action = model.action(context)
        
        #get reward for the action chosen by model
        reward = reward_vec[action]
        
        #regret is the opportunity cost of not choosing the optimal promotion
        regret = mx - reward
        records['regret_record'].append(regret)
        records['cumulative_reward'] += reward
        records['c_reward'].append(records['cumulative_reward'])
        model.update(context, action, reward)
        
        #plot occasionally
        if i <= avg_length:
            if i < avg_length:
                moving_avg=0
            else:
                moving_avg = np.array(records['regret_record']).mean()
            if i == avg_length:
                records['avg_regret'] = [moving_avg] * avg_length
        else:
            moving_avg = sum(records['regret_record'][-avg_length:])/avg_length
        records['avg_regret'].append(moving_avg)
        if i % plot_frequency == 0 and i > 0:
            c_rewardplt = np.array(records['c_reward'])/max(records['m_reward'])
            m_rewardplt = np.array(records['m_reward'])/max(records['m_reward'])
            regretplt = np.array(records['avg_regret'])/max(records['avg_regret'])
            plt.plot(records['timesteps'], c_rewardplt, label='cumulative reward')
            plt.plot(records['timesteps'], m_rewardplt, label='maximum reward')
            plt.plot(records['timesteps'], regretplt, color='red', label='mean regret')
            plt.title('Normalized Reward & Regret')
            plt.legend()
            plt.show()
    return records
            
records = iterate_model(linear_model, optimal_choices, 401, plot_frequency=400)


# from space_bandits import load_model
# linear_model.save('test_path.pkl')
# linear_model = load_model('test_path.pkl')

#continue training

records = iterate_model(linear_model, optimal_choices, 401, plot_frequency=800, records=records)


def plot_decision_boundary(model, X, Y, h=1, scale=1., parallelize=True, title='decision boundary', thompson=False, classic=False, n_threads=-1, flip_colors=True):
    ftnames = X.columns[0], X.columns[1]
    X = X.values
    #model.fit(X[:, :2], Y)
    x_min = X[:, 1].min() - .5
    x_max = X[:, 1].max() + .5
    y_min = X[:, 0].min() - .5
    y_max = X[:, 0].max() + .5
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, h),
        np.arange(y_min, y_max, h)
    )
    if classic:
        Z = model.classic_predict(np.c_[xx.ravel(), yy.ravel()]/scale, thompson=thompson)
    else:
        Z = model.predict(np.c_[xx.ravel(), yy.ravel()]/scale, thompson=thompson, parallelize=parallelize)

    # Put the result into a color plot.
    Z = Z.reshape(xx.shape)
    plt.pcolormesh(xx, yy, Z, alpha=.25)

    # Add the training points to the plot.
    if flip_colors:
        Y = np.where(np.array(Y)==1, 0, 1)
    plt.scatter(X[:, 1], X[:, 0], c=Y, alpha=.5);
    #plt.scatter(X[:, 1], X[:, 0], c='black', alpha=.1);


    plt.xlabel(ftnames[1])
    plt.ylabel(ftnames[0])

    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max());
    plt.title(title)
    plt.show()
    
    
from time import time
X = group1 + group2
Y = [1 for x in range(1000)] + [0 for x in range(1000)]
ages = [x[0] for x in X]
ARPUs = [x[1] for x in X]
as_df = pd.DataFrame()
as_df['ARPU'] = ARPUs
as_df['age'] = ages
X = as_df



plot_decision_boundary(linear_model, X, Y, h=.5, thompson=False, title='Decision Boundary Without Thompson Sampling')

t1 = time()
plot_decision_boundary(linear_model, X, Y, h=.5, thompson=True, parallelize=True, n_threads=3, title='Decision Boundary With Thompson Sampling')
print('took ', round(time()-t1, 2), ' seconds.')

took  33.64  seconds.


from space_bandits import NeuralBandits

num_actions = 3
num_features = 2
memory_size = 10000

neural_model = NeuralBandits(num_actions, num_features, initial_pulls=100, memory_size=memory_size, layer_sizes=[50, 12])
assert neural_model.data_h.memory_size == memory_size

records = iterate_model(neural_model, optimal_choices, 401, plot_frequency=400)

Training neural_model-bnn for 100 steps...
Training neural_model-bnn for 100 steps...
Training neural_model-bnn for 100 steps...
Training neural_model-bnn for 100 steps...
Training neural_model-bnn for 100 steps...
Training neural_model-bnn for 100 steps...
Training neural_model-bnn for 100 steps...
Training neural_model-bnn for 100 steps...


records = iterate_model(neural_model, optimal_choices, 401, plot_frequency=800)

Training neural_model-bnn for 100 steps...
Training neural_model-bnn for 100 steps...
Training neural_model-bnn for 100 steps...
Training neural_model-bnn for 100 steps...
Training neural_model-bnn for 100 steps...
Training neural_model-bnn for 100 steps...
Training neural_model-bnn for 100 steps...
Training neural_model-bnn for 100 steps...


plot_decision_boundary(neural_model, X, Y, h=.5, thompson=False, title='Decision Boundary Without Thompson Sampling', n_threads=3)


t1 = time()
plot_decision_boundary(neural_model, X, Y, h=.5, thompson=True, parallelize=True, title='Decision Boundary With Thompson Sampling', n_threads=3)
print('took ', time() - t1)

took  38.55807590484619


! pip install contextualbandits

Collecting contextualbandits
  Using cached https://files.pythonhosted.org/packages/a0/f9/32d77bfa13ab804e66c71720d5dd4b17bd398d575c6e02d4ce9203c8ca1e/contextualbandits-0.3.12.tar.gz
  Installing build dependencies ... \


import pandas as pd, numpy as np, re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.datasets import load_svmlight_file

def parse_data(filename):
    with open(filename, "rb") as f:
        infoline = f.readline()
        infoline = re.sub(r"^b'", "", str(infoline))
        n_features = int(re.sub(r"^\d+\s(\d+)\s\d+.*$", r"\1", infoline))
        features, labels = load_svmlight_file(f, n_features=n_features, multilabel=True)
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(labels)
    features = np.array(features.todense())
    features = np.ascontiguousarray(features)
    return features, labels

X, y = parse_data("Bibtex_data.txt")
print(X.shape)
print(y.shape)


from sklearn.linear_model import LogisticRegression
from contextualbandits.online import BootstrappedUCB, BootstrappedTS, LogisticUCB, \
            SeparateClassifiers, EpsilonGreedy, AdaptiveGreedy, ExploreFirst, \
            ActiveExplorer, SoftmaxExplorer
from copy import deepcopy

$\varepsilon$-жадная стратегия¶

Contextual bandits¶