Haskell Snippet, start of bayes probability library

I am adding support for text analysis including bayes classification. The code is based on Toby Segaran's PCIL python source code and contains some of those utility functions.

Libraries used:


import System.Environment
import qualified Data.Map as Map
import qualified Data.Set as Set
import Data.List
import Text.Regex (splitRegex, mkRegex)

Type definitions:


type WordCat = (String, String)
type WordCatInfo = (WordCat, Int)
type WordInfo = (String, Int)

Utilities for finding the word frequency in a document:


--
-- | Find word frequency given an input list using "Data.Map" utilities.
-- With (Map.empty :: Map.Map String Int), set k = String and a = Int
--    Map.empty :: Map k a
-- foldl' is a strict version of foldl = foldl': (a -> b -> a) -> a -> [b] -> a
-- Also see: updmap nm key = Map.insertWith (+) key 1 nm
-- (Original code from John Goerzen's wordFreq)
wordFreq :: [String] -> [WordInfo]
wordFreq inlst = Map.toList $ foldl' updateMap (Map.empty :: Map.Map String Int) inlst
    where updateMap freqmap word = case (Map.lookup word freqmap) of
                                         Nothing -> (Map.insert word 1 freqmap)
                                         Just x  -> (Map.insert word $! x + 1) freqmap

--
-- | Word Category Frequency, modified version of wordFreq to 
-- handle Word Category type.
wordCatFreq :: [WordCat] -> [WordCatInfo]
wordCatFreq inlst = Map.toList $ foldl' 
                    updateMap (Map.empty :: Map.Map WordCat Int) inlst
    where updateMap freqmap wordcat = case (Map.lookup wordcat freqmap) of
                                        Nothing -> (Map.insert wordcat 1 freqmap)
                                        Just x  -> (Map.insert wordcat $! x + 1) freqmap

-- | Pretty print the word/count tuple and output a string.
formatWordFreq :: WordInfo -> String
formatWordFreq tupl = fst tupl ++ " " ++ (show $ snd tupl)

formatWordCat :: WordCatInfo -> String
formatWordCat tupl = frmtcat (fst tupl) ++ " " ++ (show $ snd tupl)
    where frmtcat infotupl = (fst infotupl) ++ ", " ++ (snd infotupl)

Utilities for calculating the fisher probability:


wordFreqSort :: [String] -> [(String, Int)]
wordFreqSort inlst = sortBy freqSort . wordFreq $ inlst

--
-- | bayes classification train 
trainClassify :: String -> String -> [WordCatInfo]
trainClassify content cat = let tokens = splitRegex (mkRegex "\\s*[ \t\n]+\\s*") content
                                wordcats = [(tok, cat) | tok <- tokens] 
                        in wordCatFreq wordcats

--
-- | Return only the tokens in a category.
tokensCat :: [WordCatInfo] -> String -> [WordCatInfo]
tokensCat tokens cat = let getTokCat row = snd (fst row)
                           tokbycat = filter (\x -> ((getTokCat x) == cat)) tokens
                       in tokbycat

tokensByFeature :: [WordCatInfo] -> String -> String -> [WordCatInfo]
tokensByFeature tokens tok cat = filter (\x -> ((fst x) == (tok, cat))) tokens

--
-- | Count of number of features in a particular category
-- Extract the first tuple to get the WordCat type and then the
-- second tuple to get the category.
catCount :: [WordCatInfo] -> String -> Integer
catCount tokens cat = genericLength $ tokensCat tokens cat

-- Find the distinct categories
categories :: [WordCatInfo] -> [String]
categories tokens = let getTokCat row = snd (fst row)                     
                        allcats = Set.toList . Set.fromList $ [ getTokCat x | x <- tokens ]
                    in allcats

featureCount :: [WordCatInfo] -> String -> String -> Integer
featureCount tokens tok cat = genericLength $ tokensByFeature tokens tok cat

--
-- | Feature probality, count in this category over total in category
featureProb :: [WordCatInfo] -> String -> String -> Double
featureProb features tok cat = let fct = featureCount features tok cat
                                   catct = catCount features cat
                               in (fromIntegral fct) / (fromIntegral catct)

--
-- | Calcuate the category probability
categoryProb :: [WordCatInfo] -> String -> String -> Double
categoryProb features tok cat = initfprob / freqsum
    where initfprob = featureProb features tok cat
          freqsum = sum [ (featureProb features tok x) | x <- categories features ]

weightedProb :: [WordCatInfo] -> String -> String -> Double -> Double
weightedProb features tok cat weight = ((weight*ap)+(totals*initprob))/(weight+totals)
    where initprob = categoryProb features tok cat
          ap = 0.5
          totals = fromIntegral $ sum [ (featureCount features tok x) | x <- categories features ]

-- Inverted Chi2 formula
invChi2 :: Double -> Double -> Double
invChi2 chi df = minimum([snd newsum, 1.0])
    where m = chi / 2.0
          initsum = exp (-m)
          trm = exp (-m)
          maxrg = fromIntegral (floor (df / 2.0)) :: Double
          -- Return a tuple with current sum and term, given these inputs
          newsum = foldl (\(trm,sm) elm -> ((trm*(m/elm)), sm+trm)) 
                   (trm,initsum) [1..maxrg]

fisherProb :: [WordCatInfo] -> [String] -> String -> Double
fisherProb features tokens cat = invchi
    where initw = 1.0
          p = foldl (\prb f -> (prb * (weightedProb features f cat initw))) 1.0 tokens
          fscore = (-2) * (log p)
          invchi = invChi2 fscore ((genericLength features) * 2)

Some example test cases:


simpleTest1 :: IO ()
simpleTest1 = do
  content <- readFile badfile
  let tokens = splitRegex (mkRegex "\\s*[ \t\n]+\\s*") content
      wordfreq = wordFreqSort tokens
  mapM_ (\x -> (putStrLn $ formatWordFreq x)) wordfreq
  putStrLn $ "Number of tokens found: " ++ (show . length $ wordfreq)

simpleTest2 :: IO ()
simpleTest2 = do
  let badfreq = trainClassify "viagra is bad cialis is good" "bad"
      goodfreq = trainClassify "I like to run with foxes they cool" "good"
      allfreq = badfreq ++ goodfreq
  mapM_ (\x -> (putStrLn $ formatWordCat x)) allfreq

simpleTest3 :: IO ()
simpleTest3 = do
  let aa = [(("1", "aa") :: (String, String), -1), (("2", "aa"), -1), (("3", "bb"), -1)]
      tokensAA = tokensCat aa "aa"
      countAA = catCount aa "aa"
      c = featureProb aa "1" "aa"
  putStrLn $ "-->" ++ (show countAA) ++ " // " ++ (show tokensAA) ++ " // " ++ (show c)

simpleTest4 :: IO ()
simpleTest4 = do
  let aa = [(("dogs dogs", "good") :: (String, String), 3), 
            (("viagra", "bad") :: (String, String), 5), 
            (("fox", "good") :: (String, String), 2), 
            (("dogs", "good"), 4), 
            (("3", "bad"), 5)]
      bb = categories aa
      tokensAA = tokensByFeature aa "dogs" "good"
      c = featureProb aa "dogs" "good"
      d = catCount aa "good"
      x = categoryProb aa "xdogs" "good"
      z = weightedProb aa "dogs" "good" 1.0
  putStrLn $ "-->" ++ (show d) ++ "//" ++ (show bb) ++ "//" ++ (show z) 

simpleTest5 :: IO ()
simpleTest5 = do
  let aa = [(("dogs dogs", "good") :: (String, String), 3), 
            (("viagra", "bad") :: (String, String), 5), 
            (("fox", "good") :: (String, String), 2), 
            (("dogs", "good"), 4), 
            (("3", "bad"), 5)]
      testdata = [ "xdog" ]
      bb = fisherProb aa testdata "bad"
  putStrLn $ "-->" ++ show bb

Search This Blog

Berlin Brown and Software Development

Haskell Snippet, start of bayes probability library

Comments

Popular posts from this blog

JVM Notebook: Basic Clojure, Java and JVM Language performance

On Unit Testing, Java TDD for developers to write

Is Java the new COBOL? Yes. What does that mean, exactly? (Part 1)