Haskell Snippet, start of bayes probability library

I am adding support for text analysis including bayes classification. The code is based on Toby Segaran's PCIL python source code and contains some of those utility functions.

Libraries used:

import System.Environment
import qualified Data.Map as Map
import qualified Data.Set as Set
import Data.List
import Text.Regex (splitRegex, mkRegex)

Type definitions:

type WordCat = (String, String)
type WordCatInfo = (WordCat, Int)
type WordInfo = (String, Int)

Utilities for finding the word frequency in a document:

--
-- | Find word frequency given an input list using "Data.Map" utilities.
-- With (Map.empty :: Map.Map String Int), set k = String and a = Int
-- Map.empty :: Map k a
-- foldl' is a strict version of foldl = foldl': (a -> b -> a) -> a -> [b] -> a
-- Also see: updmap nm key = Map.insertWith (+) key 1 nm
-- (Original code from John Goerzen's wordFreq)
wordFreq :: [String] -> [WordInfo]
wordFreq inlst = Map.toList $ foldl' updateMap (Map.empty :: Map.Map String Int) inlst
where updateMap freqmap word = case (Map.lookup word freqmap) of
Nothing -> (Map.insert word 1 freqmap)
Just x -> (Map.insert word $! x + 1) freqmap

--
-- | Word Category Frequency, modified version of wordFreq to
-- handle Word Category type.
wordCatFreq :: [WordCat] -> [WordCatInfo]
wordCatFreq inlst = Map.toList $ foldl'
updateMap (Map.empty :: Map.Map WordCat Int) inlst
where updateMap freqmap wordcat = case (Map.lookup wordcat freqmap) of
Nothing -> (Map.insert wordcat 1 freqmap)
Just x -> (Map.insert wordcat $! x + 1) freqmap

-- | Pretty print the word/count tuple and output a string.
formatWordFreq :: WordInfo -> String
formatWordFreq tupl = fst tupl ++ " " ++ (show $ snd tupl)

formatWordCat :: WordCatInfo -> String
formatWordCat tupl = frmtcat (fst tupl) ++ " " ++ (show $ snd tupl)
where frmtcat infotupl = (fst infotupl) ++ ", " ++ (snd infotupl)

Utilities for calculating the fisher probability:

wordFreqSort :: [String] -> [(String, Int)]
wordFreqSort inlst = sortBy freqSort . wordFreq $ inlst

--
-- | bayes classification train
trainClassify :: String -> String -> [WordCatInfo]
trainClassify content cat = let tokens = splitRegex (mkRegex "\\s*[ \t\n]+\\s*") content
wordcats = [(tok, cat) | tok <- tokens]
in wordCatFreq wordcats

--
-- | Return only the tokens in a category.
tokensCat :: [WordCatInfo] -> String -> [WordCatInfo]
tokensCat tokens cat = let getTokCat row = snd (fst row)
tokbycat = filter (\x -> ((getTokCat x) == cat)) tokens
in tokbycat

tokensByFeature :: [WordCatInfo] -> String -> String -> [WordCatInfo]
tokensByFeature tokens tok cat = filter (\x -> ((fst x) == (tok, cat))) tokens

--
-- | Count of number of features in a particular category
-- Extract the first tuple to get the WordCat type and then the
-- second tuple to get the category.
catCount :: [WordCatInfo] -> String -> Integer
catCount tokens cat = genericLength $ tokensCat tokens cat

-- Find the distinct categories
categories :: [WordCatInfo] -> [String]
categories tokens = let getTokCat row = snd (fst row)
allcats = Set.toList . Set.fromList $ [ getTokCat x | x <- tokens ]
in allcats

featureCount :: [WordCatInfo] -> String -> String -> Integer
featureCount tokens tok cat = genericLength $ tokensByFeature tokens tok cat

--
-- | Feature probality, count in this category over total in category
featureProb :: [WordCatInfo] -> String -> String -> Double
featureProb features tok cat = let fct = featureCount features tok cat
catct = catCount features cat
in (fromIntegral fct) / (fromIntegral catct)

--
-- | Calcuate the category probability
categoryProb :: [WordCatInfo] -> String -> String -> Double
categoryProb features tok cat = initfprob / freqsum
where initfprob = featureProb features tok cat
freqsum = sum [ (featureProb features tok x) | x <- categories features ]

weightedProb :: [WordCatInfo] -> String -> String -> Double -> Double
weightedProb features tok cat weight = ((weight*ap)+(totals*initprob))/(weight+totals)
where initprob = categoryProb features tok cat
ap = 0.5
totals = fromIntegral $ sum [ (featureCount features tok x) | x <- categories features ]

-- Inverted Chi2 formula
invChi2 :: Double -> Double -> Double
invChi2 chi df = minimum([snd newsum, 1.0])
where m = chi / 2.0
initsum = exp (-m)
trm = exp (-m)
maxrg = fromIntegral (floor (df / 2.0)) :: Double
-- Return a tuple with current sum and term, given these inputs
newsum = foldl (\(trm,sm) elm -> ((trm*(m/elm)), sm+trm))
(trm,initsum) [1..maxrg]

fisherProb :: [WordCatInfo] -> [String] -> String -> Double
fisherProb features tokens cat = invchi
where initw = 1.0
p = foldl (\prb f -> (prb * (weightedProb features f cat initw))) 1.0 tokens
fscore = (-2) * (log p)
invchi = invChi2 fscore ((genericLength features) * 2)


Some example test cases:

simpleTest1 :: IO ()
simpleTest1 = do
content <- readFile badfile
let tokens = splitRegex (mkRegex "\\s*[ \t\n]+\\s*") content
wordfreq = wordFreqSort tokens
mapM_ (\x -> (putStrLn $ formatWordFreq x)) wordfreq
putStrLn $ "Number of tokens found: " ++ (show . length $ wordfreq)

simpleTest2 :: IO ()
simpleTest2 = do
let badfreq = trainClassify "viagra is bad cialis is good" "bad"
goodfreq = trainClassify "I like to run with foxes they cool" "good"
allfreq = badfreq ++ goodfreq
mapM_ (\x -> (putStrLn $ formatWordCat x)) allfreq

simpleTest3 :: IO ()
simpleTest3 = do
let aa = [(("1", "aa") :: (String, String), -1), (("2", "aa"), -1), (("3", "bb"), -1)]
tokensAA = tokensCat aa "aa"
countAA = catCount aa "aa"
c = featureProb aa "1" "aa"
putStrLn $ "-->" ++ (show countAA) ++ " // " ++ (show tokensAA) ++ " // " ++ (show c)

simpleTest4 :: IO ()
simpleTest4 = do
let aa = [(("dogs dogs", "good") :: (String, String), 3),
(("viagra", "bad") :: (String, String), 5),
(("fox", "good") :: (String, String), 2),
(("dogs", "good"), 4),
(("3", "bad"), 5)]
bb = categories aa
tokensAA = tokensByFeature aa "dogs" "good"
c = featureProb aa "dogs" "good"
d = catCount aa "good"
x = categoryProb aa "xdogs" "good"
z = weightedProb aa "dogs" "good" 1.0
putStrLn $ "-->" ++ (show d) ++ "//" ++ (show bb) ++ "//" ++ (show z)

simpleTest5 :: IO ()
simpleTest5 = do
let aa = [(("dogs dogs", "good") :: (String, String), 3),
(("viagra", "bad") :: (String, String), 5),
(("fox", "good") :: (String, String), 2),
(("dogs", "good"), 4),
(("3", "bad"), 5)]
testdata = [ "xdog" ]
bb = fisherProb aa testdata "bad"
putStrLn $ "-->" ++ show bb

Comments

Popular posts from this blog

Is Java the new COBOL? Yes. What does that mean, exactly? (Part 1)

On Unit Testing, Java TDD for developers to write

JVM Notebook: Basic Clojure, Java and JVM Language performance