Commit 8aa59040 authored by Thorsten Wißmann

Removing rule duplicates in .grammar files
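
Some input .grammar files list the same unary rule more than once. A hypothetical example of such an input (the symbol spellings are made up, assuming the textual "ruleLeft -> ruleRight weight" format mentioned in the new code comment):

    ROOT_0 -> S_0 0.5
    ROOT_0 -> S_0 0.5

Only the first occurrence is kept; each group of dropped duplicates is recorded as a comment on the resulting Grammar.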

parent 2145d5cb
@@ -8,6 +8,7 @@ import qualified Data.Text.IO as T
import qualified Data.Text as T
import Data.Text (Text)
import Data.Void
import qualified Data.Maybe as Maybe
import qualified Data.Set as S
import qualified Data.Map.Strict as M
@@ -48,6 +49,9 @@ optparseInfo = OptParse.info (optionsParser <**> OptParse.helper)
-- ================ Parsing the berkeleyparser file format ================
-- ========================================================================
type Comment = String
data Identifier = Identifier
{ identifierSymbol :: Text
, identifierIndex :: Integer
@@ -58,7 +62,7 @@ data Rule = Rule
, ruleRight :: [Identifier]
, ruleWeight :: Double
}
data Grammar = Grammar [Rule]
data Grammar = Grammar [Comment] [Rule]
instance Show Identifier where
show (Identifier i n) = T.unpack i ++ show n
@@ -67,13 +71,15 @@ instance Show Rule where
show (Rule left right weight) = show left ++ " --" ++ show weight ++ "--> " ++ show right
instance Show Grammar where
show (Grammar rules) = "[" ++ concat (L.intersperse "\n," (map ((++) " " . show) rules)) ++ "\n]"
show (Grammar comments rules) =
concat ((map ((++) "# " . flip (++) "\n" . show) comments))
++ "[" ++ concat (L.intersperse "\n," (map ((++) " " . show) rules)) ++ "\n]"
type Parser x = Parsec Void Text x
grammarParser :: Parser Grammar
grammarParser =
Grammar <$> rule `endBy` newline <* eof
Grammar [] <$> rule `endBy` newline <* eof
where
rule = do
left <- identifier
@@ -99,11 +105,37 @@ grammarParser =
double :: Parser Double
double = read <$> many1 (oneOf "0123456789-.E")
-- | some input files have duplicate rules (for unary rules)
-- and these duplicates should be ignored
--
-- we keep the first "ruleLeft -> ruleRight weight" but remove all
-- later occurrences.
removeDuplicates :: Grammar -> Grammar
removeDuplicates (Grammar comments rules) = flip State.evalState M.empty $ do
-- our state maps (ruleLeft, ruleRight) to the list of rules seen so far for that key.
newRules <-
fmap Maybe.catMaybes $ forM rules $ \r -> do
let key = (ruleLeft r, ruleRight r)
exists <- gets $ M.findWithDefault [] key
modify $ M.insert key (exists ++ [r])
return $ case exists of
[] -> Just r -- first occurrence, keep!
(_:_) -> Nothing -- duplicate occurrence, drop!
finalState <- State.get
let newComments =
map (\dups -> "Removed rule duplicates: " ++ (concat $ L.intersperse ", " $ map show dups))
$ filter (\dups -> length dups > 1)
$ map snd
$ M.toList finalState
return $ Grammar (comments ++ newComments) newRules
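-- A rough usage sketch (hypothetical identifiers, not part of this module):
-- with
--   a = Identifier "A" 0
--   b = Identifier "B" 0
--   g = Grammar [] [Rule a [b] 0.5, Rule a [b] 0.5]
-- removeDuplicates g keeps only the first of the two equal rules and adds a
-- single "Removed rule duplicates: ..." comment listing both occurrences.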
-- ================ Transforming berkeleyparser to copar data ================
-- ===========================================================================
type Comment = String
-- | a WTA as a R^(Sigma (-))-coalgebra where the states
-- have the type 'ident'
data CoalgebraWTA ident = CoalgebraWTA
@@ -126,7 +158,7 @@ coalgebraPrependComment comment coalg =
coalg { coalgComments = comment ++ coalgComments coalg }
berkeleyToCopar :: Grammar -> CoalgebraWTA Identifier
berkeleyToCopar (Grammar rules) = CoalgebraWTA
berkeleyToCopar (Grammar comments rules) = CoalgebraWTA
{ coalgArities = map snd arities
, coalgSymbol2index = M.empty
, coalgComments =
@@ -138,6 +170,8 @@ berkeleyToCopar (Grammar rules) = CoalgebraWTA
" inj" ++ show idx ++ " corresponds to " ++ T.unpack symbol ++ " with arity " ++ show arity
| ((symbol, arity), idx) <- M.toList symbolarity2index
]
++ (if length comments > 0 then [""] else [])
++ comments
, coalgStructure = structure
, coalgIdent2Comment = M.fromList ident2comment
@@ -335,6 +369,7 @@ main = do
$ coalgebraPrependComment [""]
$ coalgebraPrependComment formatComment
$ (if (optCompressNames opts) then compressCoalgebra2Ident else sanitizeNames)
$ berkeleyToCopar grammar
$ berkeleyToCopar
$ removeDuplicates grammar
-- printGrammarForCopar filename putStrLn grammar
return ()