Haskell Snippet; cat data in files
It is always useful to manipulate files. The code below (runTestTrainBayes) lists the files in the relative directory "train", keeps those whose names end in ".train", uses an anonymous function to build the path of each file, reads the content of every file, and combines the contents into one big string. wordTokens then lowercases the text, splits it on whitespace, and returns only the tokens that are longer than 1 character and shorter than 100 characters. In the end, you have a list of word tokens.
Update: Simpler Example
Update: The snippet below shows how to associate the content of the file with the filename
import Data.Char (toLower)
import System.Directory (getDirectoryContents)
import List (isPrefixOf, isSuffixOf)
import Data.SpiderNet.Bayes
-- | Relative path of the directory that is scanned for training files.
trainDir :: FilePath
trainDir = "train"
-- | Split text into lowercase word tokens.
--
-- The input is lowercased and split on runs of whitespace; only tokens whose
-- length is strictly greater than 1 and strictly less than 100 characters
-- are returned, in their original order.
wordTokens :: String -> [String]
wordTokens = filter keep . words . map toLower
  where
    -- Exclusive upper bound on the length of a token.
    maxWordLen :: Int
    maxWordLen = 100

    -- 'words' splits on every whitespace character and never yields empty
    -- tokens, so no regex is needed; the length is computed only once.
    keep w = n > 1 && n < maxWordLen
      where
        n = length w
-- | Read every file in 'trainDir' whose name ends in @.train@, tokenize the
-- combined text with 'wordTokens', and print the resulting list of tokens.
runTestTrainBayes :: IO ()
runTestTrainBayes = do
  putStrLn "Test Train Bayes"
  files <- getDirectoryContents trainDir
  let trainfiles = filter (isSuffixOf ".train") files
      trainpaths = map (\x -> trainDir ++ "/" ++ x) trainfiles
  contents <- mapM readFile trainpaths
  -- Join the file contents with newlines. Splitting the text into lines and
  -- concatenating them again without a separator fuses the last word of
  -- each line with the first word of the next one (and the last word of a
  -- file with the first word of the following file).
  let z = wordTokens (unlines contents)
  print z
Update: Simpler Example
-- | Simpler variant: read every file in 'trainDir' whose name ends in
-- @.train@ and print how many files were read.
runTestTrainBayes :: IO ()
runTestTrainBayes = do
  putStrLn "Test Train Bayes"
  entries <- getDirectoryContents trainDir
  -- Keep only the training files and prefix each name with the directory.
  let paths =
        [ trainDir ++ "/" ++ name
        | name <- entries
        , ".train" `isSuffixOf` name
        ]
  bodies <- mapM readFile paths
  print (length bodies)
Update: The snippet below shows how to associate the content of the file with the filename
-- Body of an IO do-block: pair the path of every training file with the
-- content of that file.
files <- getDirectoryContents trainDir
let trainfiles = filter (isSuffixOf ".train") files
    trainpaths = map (\x -> trainDir ++ "/" ++ x) trainfiles
-- 'fmap' comes from the Prelude, so no Control.Monad import is required;
-- d is a list of (path, content) pairs in the order of trainpaths.
d <- fmap (zip trainpaths) (mapM readFile trainpaths)
Comments