Let's Try Analysis! Reinforcement Learning in F# (Q-learning with ε-greedy Action Selection)

This is an F# version of that implementation. For an explanation of reinforcement learning itself, just read the article linked above. It was a hassle, so for now it's ε-greedy only (with probability ε pick a random action, otherwise pick the action with the highest Q value). The computed result is

seq
  [[(1, 1), 4.99999999999999]; [(1, 2), 20]; [(2, 1), 9.99999999999999];
   [(2, 2), 4.99984834210433]]

which matches the other two implementations, so it checks out. I'd like to clean up the code, which got sloppy once I ran out of steam partway through... Also, I learned for the first time that F# has no break. I guess that means "write it with recursion" (see the sketch after the code).
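For reference, the update inside the training loop is just the standard Q-learning rule, with alpha the learning rate and gamma the discount factor; this is exactly what the assignment to qTable.[(state, action)] in the code below computes:

$$Q(s, a) \leftarrow (1 - \alpha)\,Q(s, a) + \alpha\left(r + \gamma \max_{a'} Q(s', a')\right)$$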

open System
open System.Collections.Generic
open Microsoft.FSharp.Collections
// Ad hoc: problem-specific type aliases
type State = int
type Action = int
type QTable = Dictionary<State*Action, float>
// General code: Q-table helpers and action selection
// All (action, Q-value) pairs for a given state
let filteredQValuesArray (state: State) (qTable: QTable) =
    qTable
    |> Seq.filter (fun (KeyValue(k, _)) -> fst k = state)
    |> Seq.map (fun (KeyValue(k, v)) -> (snd k, v))
    |> Seq.toArray

// Greedy action: pick the action with the highest Q value,
// breaking ties uniformly at random
let selectAction (random: Random) (state: State) (qTable: QTable) =
    let qValuesAtState = filteredQValuesArray state qTable
    let qValues = qValuesAtState |> Array.map snd
    let maxQValue = qValues |> Array.max
    let maxIndices =
        [| 0 .. Array.length qValuesAtState - 1 |]
        |> Array.filter (fun i -> qValues.[i] = maxQValue)
    // choose among the tied maximal actions, not among all actions
    let selectedIndex = maxIndices.[random.Next(Array.length maxIndices)]
    qValuesAtState.[selectedIndex] |> fst

// ε-greedy: with probability ε explore (random action),
// otherwise exploit (greedy action)
let epsilonGreedy (random: Random) epsilon (state: State) (qTable: QTable) =
    if random.NextDouble() < epsilon then
        let qValuesAtState = filteredQValuesArray state qTable
        let index = random.Next(Array.length qValuesAtState)
        qValuesAtState.[index] |> fst
    else
        selectAction random state qTable
// Ad hoc: environment definition for this specific problem
// Action 1 toggles between states 1 and 2; any other action stays put
let nextState (state: State) (action: Action) : State =
    if action = 1 then
        if state = 1 then 2 else 1
    else
        state

// Reward of net only for staying (action <> 1) in state 1
let calcReward (state: State) (action: Action) net =
    if action <> 1 && state = 1 then
        net
    else
        0.0

let qLearning () =
    let random = Random()
    let numA = 2        // number of actions
    let numS = 2        // number of states
    let alpha = 0.3     // learning rate
    let gamma = 0.5     // discount factor
    let epsilon = 0.1   // exploration rate
    let trialMax = 10000
    let stepMax = 10
    let net = 10.0      // reward amount
    let mutable state : State = 1
    let qTable = QTable()
    // initialize all Q values to zero
    for s in 1 .. numS do
        for a in 1 .. numA do
            qTable.[(s, a)] <- 0.0

    for _ in 1 .. trialMax do
        let mutable j = 1
        let mutable loopFlag = true
        state <- 1
        while loopFlag && j < stepMax do
            let action = epsilonGreedy random epsilon state qTable
            let stateNext = nextState state action
            let reward = calcReward state action net
            // max over actions of Q(stateNext, a')
            let qMax =
                qTable
                |> Seq.filter (fun (KeyValue(k, _)) -> fst k = stateNext)
                |> Seq.map (fun (KeyValue(_, v)) -> v)
                |> Seq.max
            // Q-learning update
            qTable.[(state, action)] <- (1.0 - alpha)*qTable.[(state, action)] + alpha*(reward + gamma*qMax)
            if reward > 0.0 then
                loopFlag <- false   // episode ends on reward (no break in F#)
            else
                state <- stateNext
            j <- j + 1
    qTable

[<EntryPoint>]
let main argv = 
    qLearning() |> printfn "%A" 
    0
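
Regarding the "no break" remark: here's a minimal sketch of how the inner step loop could be written recursively instead of with a mutable loopFlag. The name runSteps and its parameter list are my own invention, not part of the original code; it assumes the definitions above and would sit between calcReward and qLearning.

// Recursive replacement for the flag-driven while loop: recursing is the
// "continue" case, returning unit is the "break" case.
let rec runSteps (random: Random) epsilon alpha gamma net stepMax (qTable: QTable) (state: State) step =
    if step >= stepMax then ()                  // step budget exhausted
    else
        let action = epsilonGreedy random epsilon state qTable
        let stateNext = nextState state action
        let reward = calcReward state action net
        let qMax =
            qTable
            |> Seq.filter (fun (KeyValue(k, _)) -> fst k = stateNext)
            |> Seq.map (fun (KeyValue(_, v)) -> v)
            |> Seq.max
        qTable.[(state, action)] <- (1.0 - alpha)*qTable.[(state, action)] + alpha*(reward + gamma*qMax)
        if reward > 0.0 then ()                 // the "break": episode is over
        else runSteps random epsilon alpha gamma net stepMax qTable stateNext (step + 1)

With that, the trial loop shrinks to for _ in 1 .. trialMax do runSteps random epsilon alpha gamma net stepMax qTable 1 1, and no mutable state or loopFlag is needed at all.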