Add a prettier version of the parser

Still to be done: - the high half implementing ChordPro v6 semantics - documentation of what we mean by the low half - splitting things into the relevant files (half of gmpro/parse belongs in gmpro/parse/chordpro6) - environment grouping
3 months ago · c6d545833a
parent 83b98dbcdc
commit c6d545833a
3 changed files with 309 additions and 2 deletions
--- a/gmpro/src/gmpro/demo.gleam
+++ b/gmpro/src/gmpro/demo.gleam
@ -1,4 +1,4 @@
-import gmpro
+import gmpro/parse
 import simplifile
 import gleam/list
 import gleam/io
@ -16,7 +16,7 @@ pub fn main() {
 			io.debug(err)
 			"simplifile failed."
 			})
-		|> result.map(gmpro.parse_base)
+		|> result.map(parse.parse_base)
 	case verdict {
 		Ok(result) -> io.println(pprint.styled(result))
 		Error(err) -> io.println_error(err)
--- a/gmpro/src/gmpro/parse.gleam
+++ b/gmpro/src/gmpro/parse.gleam
@ -0,0 +1,281 @@
 import gleam/int
 import gleam/result
 import gleam/list
 import gleam/regex
 import gleam/string
 import gleam/option.{type Option, Some, None}
 import gmpro/utils
 import gleam/io
 /// One argument attached to a directive.
 ///
 /// `parse_directive` in this module does not split arguments yet: it stores
 /// the whole raw text that follows the directive name as a single `Flag`.
 pub type DirectiveArgument {
 	// An argument that consists of a key only.
 	Flag(key: String)
 	// An argument that carries a key together with a value.
 	Option(key: String, value: String)
 }
 /// A single parsed directive of the form `{name-selector!: arguments}`.
 pub type Directive {
 	Directive(
 		// Name as written in the input; `canonical_directive` rewrites known
 		// short forms to their long form.
 		name: String,
 		// Selector taken from the optional `-selector` suffix, paired with a
 		// Bool that is True unless the selector was followed by `!`.
 		condition: Option(#(String, Bool)), // False when negated
 		// Arguments in order; canonicalisation may prepend extra ones.
 		arguments: List(DirectiveArgument),
 	)
 }
 /// A whole environment: its opening directive, a name and the lines it contains.
 ///
 /// NOTE(review): nothing in the visible part of this module constructs this
 /// type yet; presumably it is reserved for the planned environment grouping.
 pub type Environment {
 	Environment(
 		// The directive that opened the environment.
 		start_directive: Directive,
 		name: String,
 		// Lines of the environment body.
 		contents: List(String),
 	)
 }
 // This time we will not accumulate environments, only detect them!
 /// Classification of one input line as produced by the line parsers below.
 pub type LineType {
 	// Any line outside an environment that is not empty, not a `#` comment and not a directive.
 	ChordLine(text: String)
 	// A line that parsed as a directive.
 	DirectiveLine(Directive)
 	// A line starting with `#`; `text` holds the full line including the `#`.
 	Comment(text: String)
 	// A line that is empty (after trimming done by the caller).
 	EmptyLine
 	// A raw line found inside an environment that neither closes it nor opens a nested one.
 	EnvironmentLine(text: String)
 }
 /// Rewrites a directive to its canonical (long) form.
 ///
 /// Short aliases such as `soc` become `start_of_chorus`, and metadata
 /// shorthands such as `title` become a `meta` directive with the metadata key
 /// prepended to the argument list as a `Flag`. Unknown names pass through
 /// unchanged. The condition and the original arguments are always kept.
 pub fn canonical_directive(dir: Directive) -> Directive {
 	// We need to get canonical name early, because we need to be able to match {soc} to {end_of_chorus} etc.
 	// TODO: save orig_name?
 	let #(correct_name, new_args) = {
 		case dir.name {
 			// https://www.chordpro.org/chordpro/directives-env/
 			"soc" -> #("start_of_chorus", [])
 			"eoc" -> #("end_of_chorus", [])
 			"sov" -> #("start_of_verse", [])
 			"eov" -> #("end_of_verse", [])
 			"sob" -> #("start_of_bridge", [])
 			"eob" -> #("end_of_bridge", [])
 			"sot" -> #("start_of_tab", [])
 			"eot" -> #("end_of_tab", [])
 			"sog" -> #("start_of_grid", [])
 			// Was misspelled "endt_of_grid", which could never pair with start_of_grid.
 			"eog" -> #("end_of_grid", [])
 			// https://www.chordpro.org/chordpro/directives-meta/
 			"title" -> #("meta", [Flag("title")])
 			"sorttitle" -> #("meta", [Flag("sorttitle")])
 			"subtitle" -> #("meta", [Flag("subtitle")])
 			"artist" -> #("meta", [Flag("artist")])
 			"composer" -> #("meta", [Flag("composer")])
 			"lyricist" -> #("meta", [Flag("lyricist")])
 			"arranger" -> #("meta", [Flag("arranger")])
 			"copyright" -> #("meta", [Flag("copyright")])
 			"album" -> #("meta", [Flag("album")])
 			"year" -> #("meta", [Flag("year")])
 			"key" -> #("meta", [Flag("key")])
 			"time" -> #("meta", [Flag("time")])
 			"tempo" -> #("meta", [Flag("tempo")])
 			"duration" -> #("meta", [Flag("duration")])
 			"capo" -> #("meta", [Flag("capo")])
 			// https://www.chordpro.org/chordpro/chordpro-directives/
 			"ns" -> #("new_song", [])
 			"t" -> #("meta", [Flag("title")])
 			"st" -> #("meta", [Flag("subtitle")])
 			"c" -> #("comment", [])
 			"ci" -> #("comment_italic", [])
 			"cb" -> #("comment_box", [])
 			"cf" -> #("chordfont", [])
 			"cs" -> #("chordsize", [])
 			"tf" -> #("textfont", [])
 			"ts" -> #("textsize", [])
 			"np" -> #("new_page", [])
 			"npp" -> #("new_physical_page", [])
 			"colb" -> #("column_break", [])
 			"g" -> #("grid", [])
 			"ng" -> #("no_grid", [])
 			"col" -> #("columns", [])
 			//https://www.chordpro.org/chordpro/directives-comment/
 			"highlight" -> #("comment", [])
 			// Other cases: don't change
 			name -> #(name, [])
 		}
 	}
 	// Arguments introduced by canonicalisation go in front of the user-supplied ones.
 	Directive(..dir, name: correct_name, arguments: list.append(new_args, dir.arguments))
 }
 // NOTE: we want to have this function be user-supplied in the actual low-level parser to allow deviations.
 // We do not add semantics here, therefore we do *not* yet know which parts of the directive form one argument (i.e. `{meta title Twinkle Twinkle Little Star}`)
 // Capture groups: 1 = directive name, 2 = the whole `-selector` suffix, 3 = selector,
 // 4 = optional `!` after the selector, 5 = separator plus argument text, 6 = argument text only.
 const directive_regex = "{([a-zA-Z0-9_]+)(-([a-zA-Z0-9_]+)(!?))?([: \t]+([^}]*))?}"
 /// Parses one directive line into a `Directive`.
 ///
 /// Returns an error when `str` contains no match of `directive_regex`, when it
 /// contains more than one match, or when a negation shows up without a
 /// selector. The text after the name separator is kept unsplit as one `Flag`.
 pub fn parse_directive(str: String) -> Result(Directive, String) {
 	let assert Ok(compiled) = regex.from_string(directive_regex)
 	case regex.scan(compiled, str) {
 		[only] -> {
 			// Pad to six entries so the pattern below can bind every capture group.
 			case utils.pad_list(only.submatches, with: None, to: 6) {
 				[maybe_name, _, selector, negation, _, raw_arguments] -> {
 					let assert Some(name) = maybe_name // Cannot fail, we matched something.
 					let arguments = case raw_arguments {
 						Some(text) -> [Flag(text)]
 						None -> []
 					}
 					case selector, negation {
 						// The Bool is True for a plain selector and False when `!` was present.
 						Some(sel), _ -> Ok(Directive(name: name, condition: Some(#(sel, option.is_none(negation))), arguments: arguments))
 						None, None -> Ok(Directive(name: name, condition: None, arguments: arguments))
 						None, Some(_) -> Error("Condition only has negation without selector.")
 					}
 				}
 				_ -> panic as "submatches for did not work in parse_directive"
 			}
 		}
 		[] -> Error("This does not match the directive regex")
 		_ -> Error("Somehow, this matches the regex multiple times. (Maybe there are multiple directives on a single line?)")
 	}
 }
 // -------------------------
 // TEMP!
 /// Default environment detector: for a directive named `start_of_X` it yields
 /// the closing line `{end_of_X}`; every other directive yields `None`.
 fn environment_ends(d: Directive) -> Option(String) {
 	case d {
 		Directive(name: "start_of_" <> kind, ..) -> Some("{end_of_" <> kind <> "}")
 		_ -> None
 	}
 }
 /// Parses `s` with the default canonicaliser (`canonical_directive`) and the
 /// default environment detector (`environment_ends`).
 pub fn parse_base(s: String) -> Result(List(LineType), String) {
 	s
 	|> parse_detailed(canonical_directive, environment_ends)
 }
 /// How a parsed line affects the stack of open environments kept by `fold_func`.
 type EnvironmentChange {
 	// The stack stays as it is.
 	KeepEnv
 	// The innermost open environment was closed.
 	ExitEnv
 	// A new environment was opened; `end` is the closing line to wait for.
 	EnterEnv(end: String)
 }
 /// Classifies one line read while no environment is open.
 ///
 /// An empty line becomes `EmptyLine`, a line starting with `#` a `Comment`, a
 /// line starting with `{` must parse as a directive (otherwise the parse
 /// error is returned), and anything else is a `ChordLine`. A directive for
 /// which `is_environment` returns an end marker yields `EnterEnv`.
 fn parse_line_outside(
 			line: String,
 			canonical_directives: fn(Directive) -> Directive,
 			is_environment: fn(Directive) -> Option(String),
 		) -> Result(#(EnvironmentChange, LineType), String) {
 	case line {
 		"" -> Ok(#(KeepEnv, EmptyLine))
 		"#" <> _ -> Ok(#(KeepEnv, Comment(line)))
 		"{" <> _ ->
 			parse_directive(line)
 			|> result.map(fn(raw) {
 				let directive = canonical_directives(raw)
 				// Opening directives push their end marker; all others leave the stack alone.
 				let change = case is_environment(directive) {
 					Some(env_end) -> EnterEnv(env_end)
 					None -> KeepEnv
 				}
 				#(change, DirectiveLine(directive))
 			})
 		_ -> Ok(#(KeepEnv, ChordLine(line)))
 	}
 }
 /// Classifies one line read while an environment is open.
 ///
 /// `expect_end` is the closing line of the innermost open environment, for
 /// example `{end_of_chorus}`. The environment is closed when the line equals
 /// `expect_end` literally, or when the line is a directive without condition
 /// and arguments whose canonical name, wrapped in braces, equals `expect_end`
 /// (so `{eoc}` closes `{soc}`). A directive that opens another environment
 /// yields `EnterEnv` and prints a warning; every other line is kept verbatim
 /// as an `EnvironmentLine`.
 ///
 /// Returns an error only when the literal end line fails to parse as a directive.
 fn parse_line_inside(
 			line: String,
 			expect_end: String,
 			canonical_directives: fn(Directive) -> Directive,
 			is_environment: fn(Directive) -> Option(String),
 		) -> Result(#(EnvironmentChange, LineType), String) {
 	case line {
 		line if line == expect_end -> {
 			use directive <- result.try(parse_directive(line))
 			// Canonicalise like every other DirectiveLine produced by this module.
 			Ok(#(ExitEnv, DirectiveLine(canonical_directives(directive))))
 		}
 		"{" <> _something -> {
 			// It may be an aliased end directive or a nested env. Otherwise it is an EnvironmentLine.
 			case parse_directive(line) {
 				Error(_) -> Ok(#(KeepEnv, EnvironmentLine(line)))
 				Ok(dir) -> {
 					let canondir = canonical_directives(dir)
 					case closes_environment(canondir, expect_end) {
 						True -> Ok(#(ExitEnv, DirectiveLine(canondir)))
 						False -> case is_environment(canondir) {
 							None -> Ok(#(KeepEnv, EnvironmentLine(line)))
 							Some(end) -> {
 								// TODO: add line number to the warning!
 								io.println_error("Warning: nested environment: " <> line)
 								Ok(#(EnterEnv(end), DirectiveLine(canondir)))
 							}
 						}
 					}
 				}
 			}
 		}
 		_ -> Ok(#(KeepEnv, EnvironmentLine(line)))
 	}
 }
 /// True when the canonicalised directive, written as `{name}`, is the expected
 /// end line. Directives carrying a condition or arguments never qualify.
 fn closes_environment(canondir: Directive, expect_end: String) -> Bool {
 	canondir.condition == None
 	&& canondir.arguments == []
 	&& "{" <> canondir.name <> "}" == expect_end
 }
 /// State threaded through the per-line fold.
 ///
 /// `end_stack` holds the closing lines of the currently open environments,
 /// innermost first. `parsed_lines` holds the lines parsed so far in reverse
 /// order (newest first); `parse_detailed` reverses it at the end.
 type FoldAccumulator {
 	FoldAccumulator(end_stack: List(String), parsed_lines: List(LineType))
 	// This represents a valid parse, so when folding this is wrapped into a Result.
 }
 /// Folding step: parses `item` in the context of the open environments and
 /// returns the updated accumulator.
 ///
 /// The line is parsed with `parse_line_inside` when an environment is open and
 /// with `parse_line_outside` otherwise. Parse errors are prefixed with
 /// `line: <line_number>: `.
 fn fold_func(
 			acc: FoldAccumulator,
 			item: String,
 			line_number: Int,
 			canonical_directives: fn(Directive) -> Directive,
 			is_environment: fn(Directive) -> Option(String),
 		) -> Result(FoldAccumulator, String) {
 	// The head of the stack, if any, is the end marker of the innermost environment.
 	let parsed = case acc.end_stack {
 		[] -> parse_line_outside(item, canonical_directives, is_environment)
 		[innermost_end, ..] -> parse_line_inside(item, innermost_end, canonical_directives, is_environment)
 	}
 	case parsed {
 		Error(reason) -> Error("line: " <> int.to_string(line_number) <> ": " <> reason)
 		Ok(#(change, line_type)) -> {
 			let end_stack = case change {
 				KeepEnv -> acc.end_stack
 				EnterEnv(end) -> [end, ..acc.end_stack]
 				// Dropping from an empty stack leaves it empty.
 				ExitEnv -> list.drop(acc.end_stack, 1)
 			}
 			// Prepend; the caller reverses the list once at the end.
 			Ok(FoldAccumulator(end_stack, [line_type, ..acc.parsed_lines]))
 		}
 	}
 }
 /// Fixes the two callback arguments of a five-argument fold step, producing
 /// the three-argument function shape expected by `utils.fold_try_varindex`.
 fn fold_func_embed(
 			f: fn(
 					FoldAccumulator,
 					String,
 					Int,
 					fn(Directive) -> Directive,
 					fn(Directive) -> Option(String)
 				) -> Result(FoldAccumulator, String),
 			canonical_directives: fn(Directive) -> Directive,
 			is_environment: fn(Directive) -> Option(String),
 		) -> fn(FoldAccumulator, String, Int) -> Result(FoldAccumulator, String) {
 	let step = fn(state, text, number) {
 		f(state, text, number, canonical_directives, is_environment)
 	}
 	step
 }
 /// This parser lets the caller specify how all the parts are parsed and
 /// what the canonical names are.
 ///
 /// - `input`: the whole document; it is split on `\n` and every line is
 ///   trimmed on both sides before classification.
 /// - `canonical_directives`: maps a parsed directive to its canonical form.
 /// - `is_environment`: returns the expected closing line when the directive
 ///   opens an environment, `None` otherwise.
 ///
 /// Returns the classified lines in input order. Returns an error for the
 /// first line that fails to parse (prefixed with its 1-based line number), or
 /// when an environment is still open at the end of the input.
 pub fn parse_detailed(
 			input: String,
 			canonical_directives: fn(Directive) -> Directive,
 			is_environment: fn(Directive) -> Option(String),
 		) -> Result(List(LineType), String) {
 	// NOTE(review): trimming also strips leading whitespace of lines inside
 	// environments; confirm this is acceptable for tab/grid contents.
 	let folded =
 		input
 		|> string.split(on: "\n")
 		|> list.map(string.trim)
 		|> utils.fold_try_varindex(from: FoldAccumulator([], []), starting_with: 1, with: fold_func_embed(fold_func, canonical_directives, is_environment))
 	// The leftover `io.debug` dump of the accumulator was removed: a parser
 	// should not write to stderr on every successful call.
 	use accumulator <- result.try(folded)
 	case accumulator {
 		FoldAccumulator([], lines) -> Ok(list.reverse(lines)) // It is faster to add lines to front!
 		// A non-empty end stack means at least one environment was never closed.
 		_ -> Error("Non-empty fold accumulator: " <> string.inspect(accumulator))
 	}
 }
--- a/gmpro/src/gmpro/utils.gleam
+++ b/gmpro/src/gmpro/utils.gleam
@ -1,6 +1,8 @@
 //// Various utilities.
 import gleam/list.{type ContinueOrStop, Stop, Continue}
 import gleam/int
 import gleam/result
 /// list.fold that can both terminate early and pass index to the folding function, with an option to set the initial index
 pub fn fold_until_varindex(
@ -20,3 +22,27 @@ pub fn fold_until_varindex(
 		}
 	}
 }
 // maybe we could have reused the above, but whatever…
 /// Left fold that passes a running index to the step function and stops at
 /// the first `Error`.
 ///
 /// `index` is the index given to the first element and grows by one per
 /// element. Returns `Ok` with the final accumulator, or the first `Error`
 /// produced by `fun`.
 pub fn fold_try_varindex(
 			over collection: List(a),
 			from accumulator: b,
 			starting_with index: Int,
 			with fun: fn(b, a, Int) -> Result(b, c),
 		) -> Result(b, c) {
 	case collection {
 		[] -> Ok(accumulator)
 		[head, ..tail] ->
 			case fun(accumulator, head, index) {
 				Error(reason) -> Error(reason)
 				Ok(next) -> fold_try_varindex(tail, next, index + 1, fun)
 			}
 	}
 }
 /// Pads `list` on the right with copies of `item` until it has `length`
 /// elements. A list that is already at least that long is returned with the
 /// same elements. Useful for matching regexes.
 pub fn pad_list(list: List(a), with item: a, to length: Int) -> List(a) {
 	let missing = int.max(length - list.length(list), 0)
 	let padding = list.repeat(item, times: missing)
 	list.append(list, padding)
 }