neiro blog

Markov chains in Elixir

· [neiro]

Markov chains

A Markov chain, or Markov model, is a process that undergoes transitions from one state to another. The next state depends only on the current state and not on the sequence of events that preceded it. This allows us to use Markov chains as statistical models for real-world processes.

Figure 1: Simple Markov chain

Figure 1: Simple Markov chain

In the following example we will build a simple sentence generator based on a Markov chain.

Implementation

Let’s create an entry point of new application:

 1    # lib/elixir markov chain.ex
 2    defmodule ElixirMarkovChain do
 3      alias ElixirMarkovChain.Model
 4      alias ElixirMarkovChain.Generator
 5
 6      def start(_type, _args) do
 7        case File.read(Application.get_env :elixir_markov_chain, :source_file) do
 8           {:ok, body} -> process_source body
 9           {:error, reason} -> IO.puts reason
10        end
11
12        System.halt 0
13      end
14
15      defp process_source do
16      end
17    end

At first, to process the source file for output sentences, we need to tokenize it:

# lib/elixir_markov_chain/tokenizer.ex
defmodule ElixirMarkovChain.Tokenizer do
  @doc """
  Breaks `text` into a list of word lists, one inner list per line.

  The text is lowercased first, empty lines are dropped, and each
  remaining line is split on whitespace.
  """
  def tokenize(text) do
    lowercased = String.downcase(text)

    for sentence <- String.split(lowercased, ~r{\n}, trim: true) do
      String.split(sentence)
    end
  end
end

Next we need to implement the Markov model. We’ll use agents to share state within the application:

 1    # lib/elixir_markov_chain/model.ex
 2    defmodule ElixirMarkovChain.Model do
 3      import ElixirMarkovChain.Tokenizer
 4
 5      def start_link, do: Agent.start_link(fn -> %{} end) # create map for sharing through agent
 6
 7      def populate(pid, text) do
 8        for tokens <- tokenize(text), do: modelize(pid, tokens) # populate model with tokens
 9        pid
10      end
11
12      def fetch_token(state, pid) do
13        tokens = fetch_tokens state, pid
14
15        if length(tokens) > 0 do
16          token = Enum.random tokens
17          count = tokens |> Enum.count(&(token == &1))
18          {token, count / length(tokens)} # count probability of the token
19        else
20          {"", 0.0}
21        end
22      end
23
24      def fetch_state(tokens), do: fetch_state(tokens, length(tokens))
25      defp fetch_state(_tokens, id) when id == 0, do: {nil, nil}
26      defp fetch_state([head | _tail], id) when id == 1, do: {nil, head}
27      defp fetch_state(tokens, id) do
28        tokens
29          |> Enum.slice(id - 2..id - 1) # fetch states by ids
30          |> List.to_tuple
31      end
32
33      # Get tokens within agent
34      defp fetch_tokens(state, pid), do: Agent.get pid, &(&1[state] || [])
35
36      # Build Markov chain model using tokens
37      defp modelize(pid, tokens) do
38        for {token, id} <- Enum.with_index(tokens) do
39          tokens |> fetch_state(id) |> add_state(pid, token)
40        end
41      end
42
43      # Add new state within agent
44      defp add_state(state, pid, token) do
45        Agent.update pid, fn(model) ->
46          current_state = model[state] || []
47          Map.put model, state, [token | current_state]
48        end
49      end
50    end

When our Markov model is done, we can use it in application. For this example, we’ll build a random sentence generator based on text source:

 1    # lib/elixir_markov_chain/generator.ex
 2    defmodule ElixirMarkovChain.Generator do
 3      alias ElixirMarkovChain.Model
 4
 5      def create_sentence(pid) do
 6        {sentence, prob} = build_sentence pid
 7
 8        # Create new sentence or convert builded based on treshold value
 9        if prob >= Application.get_env(:elixir_markov_chain, :treshold) do
10          sentence |> Enum.join(" ") |> String.capitalize
11        else
12          create_sentence pid
13        end
14      end
15
16      # Sentence is complete when it have enough length
17      # or when punctuation ends a sentence
18      defp complete?(tokens) do
19        length(tokens) > 15 ||
20        (length(tokens) > 3 && Regex.match?(~r/[\!\?\.]\z/, List.last tokens))
21      end
22
23      defp build_sentence(pid), do: build_sentence(pid, [], 0.0, 0.0)
24      defp build_sentence(pid, tokens, prob_acc, new_tokens) do
25        # Fetch Markov model state through agent
26        {token, prob} = tokens |> Model.fetch_state |> Model.fetch_token(pid)
27
28        case complete?(tokens) do
29          true ->
30            score = case new_tokens == 0 do
31              true -> 1.0
32              _ -> prob_acc / new_tokens # count new probability for this word
33            end
34            {tokens, score}
35          _ ->
36            # Concat sentence with new token and try to continue
37            build_sentence pid, tokens ++ [token], prob + prob_acc, new_tokens + 1
38        end
39      end
40    end

Now that the basic logic is implemented, we need to fill in the process_source function:

# lib/elixir_markov_chain.ex
defp process_source(text) do
  # Start the agent and seed the Markov model with the source text
  {:ok, agent} = Model.start_link
  model = Model.populate(agent, text)

  # Print 10 random sentences generated from the text source
  1..10
  |> Enum.each(fn _ -> model |> Generator.create_sentence |> IO.puts end)
end

Result

Processed from Thus Spoke Zarathustra by Friedrich Nietzsche:

Processed from Metamorphosis by Franz Kafka:

Conclusion

Elixir allows us to easily build Markov chains and apply them to real-world processes. In our case we have built a random text generator, but you may find Markov models useful for other cases as well. To view the entire application, please visit this repository.

#elixir #functional