やってみよう分析!F#で強化学習(Q-learning, ε-greedy行動選択)
- やってみよう分析!Rで強化学習(Q-learning, ε-greedy行動選択) - My Life as a Mock Quant
- やってみよう分析!おまけ 2 - 1: Excel VBAで強化学習(Q-learning, ε-greedy / softmax 行動選択)
のF#実装版。強化学習自体の解説は上の記事読んどいたらいい。めんどいのでとりあえずε-greedyのみ。計算結果は
seq [[(1, 1), 4.99999999999999]; [(1, 2), 20]; [(2, 1), 9.99999999999999]; [(2, 2), 4.99984834210433]]
なんで、他2つとあってる。途中で力尽きて雑になったコードを直したい。。。そして、俺はF#にbreakがないって初めて知った。再帰で書けってことか。
open System
open System.Collections.Generic
open Microsoft.FSharp.Collections

// Adhoc...  States and actions are plain ints; the Q-table maps
// (state, action) -> Q-value.
type State = int
type Action = int
type QTable = Dictionary<State * Action, float>

// General Code

/// All (action, Q-value) pairs stored in the Q-table for the given state.
let filteredQValuesArray (state: State) (qTable: QTable) =
    qTable
    |> Seq.filter (fun (KeyValue(k, _)) -> fst k = state)
    |> Seq.map (fun (KeyValue(k, v)) -> (snd k, v))
    |> Seq.toArray

/// Greedy selection: return the action with the highest Q-value for `state`,
/// breaking ties uniformly at random.
let selectAction (random: Random) (state: State) (qTable: QTable) =
    let qValuesAtState = filteredQValuesArray state qTable
    let qValues = qValuesAtState |> Array.map snd
    let maxQValue = qValues |> Array.max
    // Indices of every action attaining the maximum (ties are possible).
    let maxIndex =
        [| 0 .. (Array.length qValuesAtState - 1) |]
        |> Array.filter (fun i -> qValues.[i] = maxQValue)
    let size = Array.length maxIndex
    // BUG FIX: the original used `random.Next(size)` directly as an index
    // into qValuesAtState, which picks among the FIRST `size` actions rather
    // than among the tied-maximum actions. Index through maxIndex instead.
    let selectedIndex = if size = 1 then maxIndex.[0] else maxIndex.[random.Next(size)]
    qValuesAtState.[selectedIndex] |> fst

/// ε-greedy: with probability `epsilon` pick a uniformly random action for
/// `state`, otherwise pick greedily via selectAction.
let epsilonGreedy (random: Random) epsilon (state: State) (qTable: QTable) =
    if random.NextDouble() < epsilon then
        let qValuesAtState = filteredQValuesArray state qTable
        let index = random.Next(Array.length qValuesAtState)
        qValuesAtState.[index] |> fst
    else
        selectAction random state qTable

// Adhoc...
/// Deterministic toy environment: action 1 toggles between states 1 and 2,
/// any other action leaves the state unchanged.
let nextState (state: State) (action: Action) : State =
    if action = 1 then
        if state = 1 then 2 else 1
    else
        state

/// Reward `net` is paid only for choosing a non-move action (action <> 1)
/// while in state 1; every other (state, action) pair earns 0.
let calcReward (state: State) (action: Action) net =
    if action <> 1 && state = 1 then net else 0.0

/// Tabular Q-learning with ε-greedy action selection over the 2-state,
/// 2-action toy MDP defined by nextState/calcReward. Returns the learned
/// Q-table. (Idiom cleanup: `ref`/`incr`/`!` cells replaced with
/// `let mutable`, redundant `new` and `int` casts removed; the update rule,
/// step counts, and termination condition are unchanged.)
let qLearning () =
    let random = Random()
    let numA = 2          // number of actions
    let numS = 2          // number of states
    let alpha = 0.3       // learning rate
    let gamma = 0.5       // discount factor
    let epsilon = 0.1     // exploration probability
    let trialMax = 10000  // number of episodes
    let stepMax = 10      // step budget per episode
    let net = 10.0        // reward for the rewarding (state, action) pair
    let qTable = QTable()
    // Initialize every (state, action) entry to zero.
    for s in 1 .. numS do
        for a in 1 .. numA do
            qTable.[(s, a)] <- 0.0
    for _ in 1 .. trialMax do
        let mutable state : State = 1
        let mutable j = 1
        let mutable loopFlag = true
        // Episode ends on the first positive reward or after stepMax - 1
        // steps (same `j < stepMax` bound as the original).
        while loopFlag && j < stepMax do
            let action = epsilonGreedy random epsilon state qTable
            let stateNext = nextState state action
            let reward = calcReward state action net
            // max over a' of Q(stateNext, a').
            let qMax =
                qTable
                |> Seq.filter (fun (KeyValue(k, _)) -> fst k = stateNext)
                |> Seq.map (fun (KeyValue(_, v)) -> v)
                |> Seq.max
            // Standard Q-learning update:
            // Q(s,a) <- (1-α)·Q(s,a) + α·(r + γ·max_a' Q(s',a'))
            qTable.[(state, action)] <-
                (1.0 - alpha) * qTable.[(state, action)] + alpha * (reward + gamma * qMax)
            if reward > 0.0 then loopFlag <- false else state <- stateNext
            j <- j + 1
    qTable

[<EntryPoint>]
let main argv =
    qLearning () |> printfn "%A"
    0