elipl/lib/til/parser.ex
Kacper Marzecki 748f87636a checkpoint
checkpoint

failing test

after fixing tests

checkpoint

checkpoint

checkpoint

re-work

asd

checkpoint

checkpoint

checkpoint

mix proj

checkpoint mix

first parser impl

checkpoint

fix tests

re-org parser

checkpoint strings

fix multiline strings

tuples

checkpoint maps

checkpoint

checkpoint

checkpoint

checkpoint

fix weird eof expression parse error

checkpoint before typing

checkpoint

checkpoint

checkpoint

checkpoint

checkpoint ids in primitive types

checkpoint

checkpoint

fix tests

initial annotation

checkpoint

checkpoint

checkpoint

union subtyping

conventions

refactor - split typer

typing tuples

checkpoint test refactor

checkpoint test refactor

parsing atoms

checkpoint atoms

wip lists

checkpoint typing lists

checkpoint

checkpoint

wip fixing

correct list typing

map discussion

checkpoint map basic typing

fix tests checkpoint

checkpoint

checkpoint

checkpoint

fix condition typing

fix literal keys in map types

checkpoint union types

checkpoint union type

checkpoint row types discussion & bidirectional typecheck

checkpoint

basic lambdas

checkpoint lambdas typing application

wip function application

checkpoint

checkpoint

checkpoint cduce

checkpoint

checkpoint

checkpoint

checkpoint

checkpoint

checkpoint

checkpoint
2025-06-13 23:48:07 +02:00

964 lines
34 KiB
Elixir

defmodule Til.Parser do
@moduledoc """
Parser for the Tilly Lisp dialect.
It transforms source code into a collection of Node Maps.

`parse/2` returns `{:ok, nodes}` where `nodes` maps a unique integer node id
to a node map. Every node carries `:id`, `:parent_id`, `:file`, `:location`
(`[start_offset, start_line, start_col, end_offset, end_line, end_col]`),
`:raw_string` and `:ast_node_type`; collection and file nodes also carry
`:children`, and failed parses carry a non-nil `:parsing_error`.
"""
# Represents the current parsing position:
#   offset    - 0-based character offset into the source
#   line/col  - 1-based cursor position (col resets to 1 after a newline)
#   file_name - copied into every node's :file field
#   nodes     - accumulating id => node map built up during the parse
defstruct offset: 0, line: 1, col: 1, file_name: "unknown", nodes: %{}
@doc """
Parses a source string into a map of AST nodes.

Returns `{:ok, nodes}` where `nodes` maps each unique integer node id to a
node map. The root is a single `:file` node spanning the whole source whose
`:children` lists all top-level expressions sorted by start offset. Parse
errors never abort the parse; they surface as nodes with `:parsing_error` set.
"""
def parse(source_string, file_name \\ "unknown") do
  file_node_id = System.unique_integer([:monotonic, :positive])
  # Initial location for the file node (starts at the beginning)
  file_start_offset = 0
  file_start_line = 1
  file_start_col = 1

  # End location and raw_string will be finalized after parsing all content
  prelim_file_node = %{
    id: file_node_id,
    type_id: nil,
    # File node is the root
    parent_id: nil,
    file: file_name,
    # End TBD
    location: [file_start_offset, file_start_line, file_start_col, 0, 0, 0],
    # TBD
    raw_string: "",
    ast_node_type: :file,
    # TBD
    children: [],
    parsing_error: nil
  }

  initial_state = %__MODULE__{
    file_name: file_name,
    # Add prelim file node
    nodes: %{file_node_id => prelim_file_node},
    # Initial state offset should be 0 for the file
    offset: 0,
    # Initial state line should be 1
    line: 1,
    # Initial state col should be 1
    col: 1
  }

  # Pass original_source_string for raw_string extraction, and file_node_id as parent for top-level exprs
  final_state_after_expressions =
    parse_all_expressions(source_string, source_string, initial_state, file_node_id)

  # Finalize the file node.
  # Calculate end position of the entire source string
  {file_end_line, file_end_col} = calculate_new_line_col(source_string, 1, 1)
  # Offset is 0-indexed, length is the count of characters, so end_offset is length.
  file_end_offset = String.length(source_string)

  # Collect children of the file node (any node parented directly to it)
  file_children_ids =
    final_state_after_expressions.nodes
    |> Map.values()
    |> Enum.filter(&(&1.parent_id == file_node_id))
    # Sort by start offset to maintain order of appearance in the source
    |> Enum.sort_by(fn node -> hd(node.location) end)
    |> Enum.map(& &1.id)

  updated_file_node =
    final_state_after_expressions.nodes
    |> Map.get(file_node_id)
    |> Map.merge(%{
      location: [
        file_start_offset,
        file_start_line,
        file_start_col,
        file_end_offset,
        file_end_line,
        file_end_col
      ],
      # The entire source is the raw string of the file node
      raw_string: source_string,
      children: file_children_ids
    })

  final_nodes =
    Map.put(final_state_after_expressions.nodes, file_node_id, updated_file_node)

  {:ok, final_nodes}
end
# --- Main Parsing Logic ---
# `original_source_string` is the untouched full input (kept for raw-string
# extraction); `source_string` is the not-yet-consumed remainder.
# `parent_id_for_top_level_expressions` is the node id (the :file node) that
# every top-level expression is parented to.
defp parse_all_expressions(
       original_source_string,
       source_string,
       state,
       parent_id_for_top_level_expressions
     ) do
  case skip_whitespace(source_string, state) do
    {:eos, done_state} ->
      done_state

    {:ok, "", done_state} ->
      # Nothing but whitespace remained - this branch of parsing is complete.
      done_state

    {:ok, content, state_at_content} ->
      # parse_datum either succeeds or records an error node; both outcomes
      # hand back the remaining source and updated state, so the loop
      # continues uniformly.
      {next_source, next_state} =
        case parse_datum(
               original_source_string,
               content,
               state_at_content,
               parent_id_for_top_level_expressions
             ) do
          {:ok, _node_id, rest, new_state} -> {rest, new_state}
          {:error_node, _node_id, _reason, rest, new_state} -> {rest, new_state}
        end

      # NOTE: termination relies on parse_datum always consuming input on a
      # non-empty source (create_error_node_and_advance consumes 1 char in
      # the worst case).
      parse_all_expressions(
        original_source_string,
        next_source,
        next_state,
        parent_id_for_top_level_expressions
      )
  end
end
# Parses a single datum: a literal (integer, string, atom), a symbol, or one
# of the collection forms (s-expression, list, tuple, map).
defp parse_datum(original_source_string, source, state, parent_id) do
  # Dispatch on the leading token. The multi-character opener "m{" must be
  # tried before the single-character "{", and the delimiter cases before the
  # generic integer/symbol fallback. On an empty source no prefix matches and
  # the default branch produces a 1-character error node (consuming nothing).
  cond do
    String.starts_with?(source, "m{") ->
      parse_map_expression(original_source_string, source, state, parent_id)

    String.starts_with?(source, "(") ->
      parse_s_expression(original_source_string, source, state, parent_id)

    String.starts_with?(source, ")") ->
      # A stray ')' - record a 1-character error token and move on.
      create_error_node_and_advance(source, state, parent_id, 1, "Unexpected ')'")

    String.starts_with?(source, "[") ->
      parse_list_expression(original_source_string, source, state, parent_id)

    String.starts_with?(source, "]") ->
      create_error_node_and_advance(source, state, parent_id, 1, "Unexpected ']'")

    String.starts_with?(source, "{") ->
      # For tuples
      parse_tuple_expression(original_source_string, source, state, parent_id)

    String.starts_with?(source, "}") ->
      create_error_node_and_advance(source, state, parent_id, 1, "Unexpected '}'")

    String.starts_with?(source, "'") ->
      parse_string_datum(original_source_string, source, state, parent_id)

    String.starts_with?(source, ":") ->
      parse_colon_token(source, state, parent_id)

    true ->
      parse_other_token(source, state, parent_id)
  end
end

# ':' may start an atom literal (e.g. `:foo`); if that fails, fall back to
# symbol parsing, and finally to a 1-character error node.
defp parse_colon_token(source, state, parent_id) do
  case parse_atom_datum(source, state, parent_id) do
    {:ok, _, _, _} = ok ->
      ok

    {:error, :not_atom} ->
      case parse_symbol_datum(source, state, parent_id) do
        {:ok, _, _, _} = ok ->
          ok

        {:error, :not_symbol} ->
          create_error_node_and_advance(
            source,
            state,
            parent_id,
            1,
            "Unknown token starting with ':'"
          )
      end
  end
end

# Default case: try integer first, then symbol, then a 1-character error node.
defp parse_other_token(source, state, parent_id) do
  case parse_integer_datum(source, state, parent_id) do
    {:ok, _, _, _} = ok ->
      ok

    {:error, :not_integer} ->
      case parse_symbol_datum(source, state, parent_id) do
        {:ok, _, _, _} = ok ->
          ok

        {:error, :not_symbol} ->
          create_error_node_and_advance(source, state, parent_id, 1, "Unknown token")
      end
  end
end
# --- Datum Parsing Helpers --- (parse_string_datum, process_string_content)
# Parses a "'"-delimited string literal. Multiline bodies are dedented by the
# column of the opening quote (see process_string_content/2). Returns
# {:ok, ...} or, for an unclosed literal, {:error_node, ...} after consuming
# the rest of the input.
defp parse_string_datum(_original_source_string, source, state, parent_id) do
  # state is before consuming "'"
  initial_state_for_token = state
  # Dedent amount for continuation lines: the 0-based column of the quote.
  strip_indent = initial_state_for_token.col - 1
  # Consume opening "'"
  {opening_tick, source_after_opening_tick} = String.split_at(source, 1)

  case :binary.match(source_after_opening_tick, "'") do
    :nomatch ->
      # Unclosed string: everything to EOF becomes the content, and the node
      # is recorded with a parsing_error while still carrying its value.
      content_segment = source_after_opening_tick
      raw_token = opening_tick <> content_segment
      state_at_node_end = advance_pos(initial_state_for_token, raw_token)

      location = [
        initial_state_for_token.offset,
        initial_state_for_token.line,
        initial_state_for_token.col,
        state_at_node_end.offset,
        state_at_node_end.line,
        state_at_node_end.col
      ]

      processed_value = process_string_content(content_segment, strip_indent)

      {node_id, state_with_error_node} =
        add_node(
          initial_state_for_token,
          parent_id,
          location,
          raw_token,
          :literal_string,
          %{value: processed_value, parsing_error: "Unclosed string literal"}
        )

      final_state = %{
        state_with_error_node
        | offset: state_at_node_end.offset,
          line: state_at_node_end.line,
          col: state_at_node_end.col
      }

      {:error_node, node_id, "Unclosed string literal", "", final_state}

    {idx_closing_tick, _tick_length} ->
      # BUGFIX: :binary.match/2 returns a BYTE offset, but the previous code
      # sliced with String.slice/2,3, which count graphemes - any multi-byte
      # UTF-8 character before the closing quote mis-sliced the literal.
      # binary_part/3 slices by bytes, matching the offset's semantics.
      content_segment = binary_part(source_after_opening_tick, 0, idx_closing_tick)
      closing_tick = "'"
      raw_token = opening_tick <> content_segment <> closing_tick

      rest_of_source =
        binary_part(
          source_after_opening_tick,
          idx_closing_tick + 1,
          byte_size(source_after_opening_tick) - (idx_closing_tick + 1)
        )

      state_at_node_end = advance_pos(initial_state_for_token, raw_token)

      location = [
        initial_state_for_token.offset,
        initial_state_for_token.line,
        initial_state_for_token.col,
        state_at_node_end.offset,
        state_at_node_end.line,
        state_at_node_end.col
      ]

      processed_value = process_string_content(content_segment, strip_indent)

      {new_node_id, state_with_node} =
        add_node(
          initial_state_for_token,
          parent_id,
          location,
          raw_token,
          :literal_string,
          %{value: processed_value}
        )

      final_state = %{
        state_with_node
        | offset: state_at_node_end.offset,
          line: state_at_node_end.line,
          col: state_at_node_end.col
      }

      {:ok, new_node_id, rest_of_source, final_state}
  end
end
# Dedents the body of a (possibly multiline) string literal.
# The first line is kept verbatim; every subsequent line loses up to
# `strip_indent` leading whitespace characters (never more than it actually
# has), so string bodies can be indented to match the surrounding code.
defp process_string_content(content_str, strip_indent) when strip_indent >= 0 do
  # String.split/3 always yields at least one element, even for "".
  [first_line | rest_lines] = String.split(content_str, "\n", trim: false)

  processed_rest_lines =
    Enum.map(rest_lines, fn line ->
      # Length of the leading whitespace run; ^(\s*) always matches (possibly
      # the empty string), so Regex.run/2 never returns nil here.
      current_leading_spaces_count =
        Regex.run(~r/^(\s*)/, line)
        |> List.first()
        |> String.length()

      spaces_to_remove = min(current_leading_spaces_count, strip_indent)

      # String.split_at/2 replaces the deprecated negative-step range slice
      # `String.slice(line, spaces_to_remove..-1)`.
      line
      |> String.split_at(spaces_to_remove)
      |> elem(1)
    end)

  Enum.join([first_line | processed_rest_lines], "\n")
end
# Parses an atom literal: ':' followed by one or more non-delimiter
# characters (delimiters: whitespace and ()[]{}). The colon is part of the
# atom's raw string; the name after it becomes the node's value.
defp parse_atom_datum(source, state, parent_id) do
  case Regex.run(~r/^:([^\s\(\)\[\]\{\}]+)/, source) do
    [raw_atom_str, atom_name_part] ->
      # raw_atom_str is e.g. ":foo"; atom_name_part is "foo" (the `+` in the
      # regex guarantees it is non-empty).
      # split_at replaces the deprecated negative-step range slice
      # `String.slice(source, len..-1)`.
      {_consumed, rest_after_atom} = String.split_at(source, String.length(raw_atom_str))

      start_offset = state.offset
      start_line = state.line
      start_col = state.col
      state_after_token = advance_pos(state, raw_atom_str)

      location = [
        start_offset,
        start_line,
        start_col,
        state_after_token.offset,
        state_after_token.line,
        state_after_token.col
      ]

      # NOTE(review): String.to_atom/1 creates a new atom for every distinct
      # name, and atoms are never garbage-collected - parsing untrusted
      # source could exhaust the atom table. Consider interning the name as a
      # string instead.
      atom_value = String.to_atom(atom_name_part)

      {new_node_id, state_with_node} =
        add_node(
          state,
          parent_id,
          location,
          raw_atom_str,
          :literal_atom,
          %{value: atom_value}
        )

      final_state = %{
        state_with_node
        | offset: state_after_token.offset,
          line: state_after_token.line,
          col: state_after_token.col
      }

      {:ok, new_node_id, rest_after_atom, final_state}

    _ ->
      # No match: just ":" alone, or ':' immediately followed by a delimiter.
      {:error, :not_atom}
  end
end
# Attempts to parse a leading integer literal. On failure nothing is
# consumed and {:error, :not_integer} is returned so the caller can try
# other datum kinds.
defp parse_integer_datum(source, state, parent_id) do
  case Integer.parse(source) do
    :error ->
      # Source does not start with an integer; state untouched.
      {:error, :not_integer}

    {int_val, rest} ->
      # The consumed prefix is whatever Integer.parse/1 ate.
      consumed_len = String.length(source) - String.length(rest)
      raw_int = String.slice(source, 0, consumed_len)
      pos_after = advance_pos(state, raw_int)

      location = [
        state.offset,
        state.line,
        state.col,
        pos_after.offset,
        pos_after.line,
        pos_after.col
      ]

      {node_id, state_with_node} =
        add_node(state, parent_id, location, raw_int, :literal_integer, %{value: int_val})

      # Advance the running position past the consumed token.
      next_state = %{
        state_with_node
        | offset: pos_after.offset,
          line: pos_after.line,
          col: pos_after.col
      }

      {:ok, node_id, rest, next_state}
  end
end
# Parses a symbol: a maximal run of non-delimiter characters (delimiters are
# whitespace and ()[]{}). "m{" is dispatched before symbol parsing.
# NOTE(review): the single quote "'" is NOT a delimiter here, so a symbol can
# swallow an adjacent string quote (e.g. `foo'bar'`) - confirm intended.
defp parse_symbol_datum(source, state, parent_id) do
  case Regex.run(~r/^([^\s\(\)\[\]\{\}]+)/, source) do
    nil ->
      # Source does not start with a symbol character; nothing consumed.
      {:error, :not_symbol}

    [raw_symbol | _] ->
      # split_at replaces the deprecated negative-step range slice
      # `String.slice(source, len..-1)`.
      {_consumed, rest_after_symbol} = String.split_at(source, String.length(raw_symbol))

      start_offset = state.offset
      start_line = state.line
      start_col = state.col
      state_after_token = advance_pos(state, raw_symbol)

      location = [
        start_offset,
        start_line,
        start_col,
        state_after_token.offset,
        state_after_token.line,
        state_after_token.col
      ]

      {new_node_id, state_with_node} =
        add_node(state, parent_id, location, raw_symbol, :symbol, %{name: raw_symbol})

      # Advance the running position past the consumed token.
      final_state = %{
        state_with_node
        | offset: state_after_token.offset,
          line: state_after_token.line,
          col: state_after_token.col
      }

      {:ok, new_node_id, rest_after_symbol, final_state}
  end
end
# Records `num_chars_for_token` characters as an :unknown error node carrying
# `error_message`, advances the parser position past them, and returns the
# remaining source so parsing can continue.
defp create_error_node_and_advance(
       source_for_token,
       state_before_token,
       parent_id,
       num_chars_for_token,
       error_message
     ) do
  {raw_token, rest_of_source} = String.split_at(source_for_token, num_chars_for_token)

  advanced = advance_pos(state_before_token, raw_token)

  location = [
    state_before_token.offset,
    state_before_token.line,
    state_before_token.col,
    advanced.offset,
    advanced.line,
    advanced.col
  ]

  {error_node_id, state_with_error_node} =
    add_node(state_before_token, parent_id, location, raw_token, :unknown, %{
      parsing_error: error_message
    })

  # Further parsing resumes just past the consumed token, with the error node
  # already registered in the node map.
  final_error_state = %{
    state_with_error_node
    | offset: advanced.offset,
      line: advanced.line,
      col: advanced.col
  }

  {:error_node, error_node_id, error_message, rest_of_source, final_error_state}
end
# Parses a "(...)" form via the generic collection parser, then re-tags the
# result as a :lambda_expression when its head is the symbol `fn`.
defp parse_s_expression(original_source_string, source, state, parent_id) do
  result =
    parse_collection(
      original_source_string,
      source,
      state,
      parent_id,
      "(",
      ")",
      :s_expression,
      "Unclosed S-expression",
      "Error parsing element in S-expression. Content might be incomplete."
    )

  # Only a successfully parsed S-expression whose first child is the `fn`
  # symbol is transformed; every other outcome passes through untouched.
  with {:ok, node_id, rest_after_collection, state_after} <- result,
       collection_node = Map.get(state_after.nodes, node_id),
       true <- is_fn_expression?(collection_node, state_after.nodes) do
    transformed_node = transform_to_lambda_expression(collection_node, state_after.nodes)

    final_state = %{
      state_after
      | nodes: Map.put(state_after.nodes, transformed_node.id, transformed_node)
    }

    {:ok, transformed_node.id, rest_after_collection, final_state}
  else
    # Errors from parse_collection, or a plain (non-fn) S-expression.
    _ -> result
  end
end
# True when the node is an S-expression whose first child is the symbol `fn`.
# Always returns a strict boolean: the previous implementation could leak
# `nil` out of its `&&` chain when the first child id was missing from the
# node map, and raised if a node lacked a :children key.
defp is_fn_expression?(%{ast_node_type: :s_expression, children: [first_child_id | _]}, nodes_map) do
  match?(%{ast_node_type: :symbol, name: "fn"}, Map.get(nodes_map, first_child_id))
end

defp is_fn_expression?(_node, _nodes_map), do: false
# Helper to transform a generic S-expression node (known to be an 'fn' form)
# into a :lambda_expression node.
# On success the node is re-tagged :lambda_expression and gains
# :params_s_expr_id, :arg_spec_node_ids, :return_type_spec_node_id and
# :body_node_ids. On any validation failure the node is returned unchanged
# except for a descriptive :parsing_error (it stays an :s_expression).
defp transform_to_lambda_expression(s_expr_node, nodes_map) do
  # s_expr_node.children = [fn_symbol_id, params_s_expr_id, body_form1_id, ...]
  # Already checked by is_fn_expression?/2
  _fn_symbol_id = Enum.at(s_expr_node.children, 0)

  if length(s_expr_node.children) < 2 do
    %{s_expr_node | parsing_error: "Malformed 'fn' expression: missing parameters list."}
  else
    params_s_expr_id = Enum.at(s_expr_node.children, 1)
    params_s_expr_node = Map.get(nodes_map, params_s_expr_id)

    if !(params_s_expr_node && params_s_expr_node.ast_node_type == :s_expression) do
      Map.put(s_expr_node, :parsing_error, "Malformed 'fn' expression: parameters list is not an S-expression.")
    else
      # Children of the parameters S-expression, e.g. for (fn ((a integer) (b atom) atom) ...),
      # param_s_expr_children_ids would be IDs of [(a integer), (b atom), atom]
      all_param_children_ids = Map.get(params_s_expr_node, :children, [])

      {arg_spec_node_ids, return_type_spec_node_id} =
        if Enum.empty?(all_param_children_ids) do
          # Case: (fn () body) -> No args, nil (inferred) return type spec
          {[], nil}
        else
          # Case: (fn (arg1 type1 ... ret_type) body)
          # Last element is return type spec, rest are arg specs.
          args = Enum.take(all_param_children_ids, length(all_param_children_ids) - 1)
          ret_type_id = List.last(all_param_children_ids)
          {args, ret_type_id}
        end

      # Validate arg_spec_node_ids: each must be a symbol or an S-expr (param_symbol type_spec)
      all_arg_specs_valid =
        Enum.all?(arg_spec_node_ids, fn arg_id ->
          arg_node = Map.get(nodes_map, arg_id)

          case arg_node do
            # Bare parameter, e.g. x
            %{ast_node_type: :symbol} -> true

            # Typed parameter, e.g. (x integer)
            %{ast_node_type: :s_expression, children: s_children} ->
              if length(s_children) == 2 do
                param_sym_node = Map.get(nodes_map, hd(s_children))
                type_spec_node = Map.get(nodes_map, hd(tl(s_children)))

                # The name must be a symbol; the type spec may itself be a
                # symbol or a nested S-expression.
                param_sym_node && param_sym_node.ast_node_type == :symbol &&
                  type_spec_node && (type_spec_node.ast_node_type == :symbol || type_spec_node.ast_node_type == :s_expression)
              else
                # Not a valid (param_symbol type_spec) structure
                false
              end

            # Not a symbol or valid S-expression for arg spec
            _ -> false
          end
        end)

      # Validate return_type_spec_node_id: must be nil or a valid type specifier node
      return_type_spec_valid =
        if is_nil(return_type_spec_node_id) do
          # Inferred return type is valid
          true
        else
          ret_type_node = Map.get(nodes_map, return_type_spec_node_id)

          ret_type_node && (ret_type_node.ast_node_type == :symbol || ret_type_node.ast_node_type == :s_expression)
        end

      if all_arg_specs_valid && return_type_spec_valid do
        # Body starts after 'fn' and params_s_expr
        body_node_ids = Enum.drop(s_expr_node.children, 2)

        Map.merge(s_expr_node, %{
          :ast_node_type => :lambda_expression,
          :params_s_expr_id => params_s_expr_id,
          :arg_spec_node_ids => arg_spec_node_ids,
          :return_type_spec_node_id => return_type_spec_node_id,
          :body_node_ids => body_node_ids
        })
      else
        # Determine more specific error message (arg errors take precedence)
        error_message =
          cond do
            !all_arg_specs_valid -> "Malformed 'fn' expression: invalid argument specification(s)."
            !return_type_spec_valid -> "Malformed 'fn' expression: invalid return type specification."
            # Generic fallback
            true -> "Malformed 'fn' expression."
          end

        Map.put(s_expr_node, :parsing_error, error_message)
      end
    end
  end
end
# "[ ... ]" - list literal; delegates to the generic collection parser.
defp parse_list_expression(original_source_string, source, state, parent_id) do
  parse_collection(
    original_source_string,
    source,
    state,
    parent_id,
    "[",
    "]",
    :list_expression,
    "Unclosed list",
    "Error parsing element in list. Content might be incomplete."
  )
end
# "m{ ... }" - map literal; the two-character opener is matched by
# parse_datum/4 before the single-character "{" tuple opener.
defp parse_map_expression(original_source_string, source, state, parent_id) do
  parse_collection(
    original_source_string,
    source,
    state,
    parent_id,
    # Opening token
    "m{",
    # Closing token
    "}",
    :map_expression,
    "Unclosed map",
    "Error parsing element in map. Content might be incomplete."
  )
end
# "{ ... }" - tuple literal; delegates to the generic collection parser.
defp parse_tuple_expression(original_source_string, source, state, parent_id) do
  parse_collection(
    original_source_string,
    source,
    state,
    parent_id,
    "{",
    "}",
    :tuple_expression,
    "Unclosed tuple",
    "Error parsing element in tuple. Content might be incomplete."
  )
end
# Generic parser for delimited collections: S-expressions "(...)", lists
# "[...]", tuples "{...}" and maps "m{...}". Registers a preliminary node
# (end position, raw string and children still unknown), parses the elements,
# then adapts the element parser's result to the {:ok, ...} / {:error_node, ...}
# datum protocol.
defp parse_collection(
       original_source_string,
       source,
       state,
       parent_id,
       open_char_str,
       close_char_str,
       ast_node_type,
       unclosed_error_msg,
       element_error_msg
     ) do
  # The collection node starts where its opening token starts.
  start_offset = state.offset
  start_line = state.line
  start_col = state.col

  # Consume the opening token (e.g. "(", "[", "m{").
  {_opening_token, rest_after_opening_token} =
    String.split_at(source, String.length(open_char_str))

  state_after_open = advance_pos(state, open_char_str)

  collection_node_id = System.unique_integer([:monotonic, :positive])

  # End location, raw_string and children are filled in once the closing
  # token (or EOF) is reached.
  prelim_collection_node = %{
    id: collection_node_id,
    type_id: nil,
    parent_id: parent_id,
    file: state_after_open.file_name,
    location: [start_offset, start_line, start_col, 0, 0, 0],
    raw_string: "",
    ast_node_type: ast_node_type,
    children: [],
    parsing_error: nil
  }

  state_with_prelim = %{
    state_after_open
    | nodes: Map.put(state_after_open.nodes, collection_node_id, prelim_collection_node)
  }

  elements_result =
    parse_collection_elements(
      original_source_string,
      rest_after_opening_token,
      state_with_prelim,
      collection_node_id,
      [],
      {start_offset, start_line, start_col},
      close_char_str,
      unclosed_error_msg,
      element_error_msg
    )

  case elements_result do
    {:ok, _returned_node_id, _rest, _state_after_elements} = ok ->
      ok

    {:error, reason, rest, state_after_elements} ->
      # Unclosed collection (or fatal element error): the collection node
      # itself is the error carrier.
      {:error_node, collection_node_id, reason, rest, state_after_elements}
  end
end
# Generalized from parse_s_expression_elements.
# Element loop for all collection types: accumulates children ids (in
# reverse) until the closing token, EOF, or an all-whitespace tail.
defp parse_collection_elements(
       original_source_string,
       source,
       state,
       collection_node_id,
       children_ids_acc,
       collection_start_pos_tuple,
       closing_char_str,
       unclosed_error_message,
       element_error_message
     ) do
  case skip_whitespace(source, state) do
    {:eos, state_at_eos} ->
      finalize_unclosed_collection(
        original_source_string,
        state_at_eos,
        collection_node_id,
        children_ids_acc,
        collection_start_pos_tuple,
        unclosed_error_message
      )

    {:ok, "", state_at_eos} ->
      # BUGFIX: when a collection ended with whitespace followed by EOF, the
      # old code fell through to parse_datum/4 with "" and fabricated a
      # spurious empty "Unknown token" child before reporting the unclosed
      # collection. An all-whitespace tail is now treated exactly like EOF.
      finalize_unclosed_collection(
        original_source_string,
        state_at_eos,
        collection_node_id,
        children_ids_acc,
        collection_start_pos_tuple,
        unclosed_error_message
      )

    {:ok, remaining_source, current_state} ->
      if String.starts_with?(remaining_source, closing_char_str) do
        # End of collection: consume the closing token and finalize the node.
        {_closing_token, rest_after_closing_token} =
          String.split_at(remaining_source, String.length(closing_char_str))

        final_collection_state = advance_pos(current_state, closing_char_str)
        collection_node = Map.get(final_collection_state.nodes, collection_node_id)

        {start_offset, start_line, start_col} = collection_start_pos_tuple
        end_offset = final_collection_state.offset

        actual_raw_string =
          String.slice(original_source_string, start_offset, end_offset - start_offset)

        updated_collection_node = %{
          collection_node
          | children: Enum.reverse(children_ids_acc),
            location: [
              start_offset,
              start_line,
              start_col,
              end_offset,
              final_collection_state.line,
              final_collection_state.col
            ],
            raw_string: actual_raw_string
        }

        final_state_with_collection = %{
          final_collection_state
          | nodes:
              Map.put(
                final_collection_state.nodes,
                collection_node_id,
                updated_collection_node
              )
        }

        {:ok, collection_node_id, rest_after_closing_token, final_state_with_collection}
      else
        # Parse one element. Both success and element-level error yield a
        # child node id plus the remaining source, so the loop continues
        # uniformly. Termination relies on parse_datum/4 always consuming
        # input on a non-empty source.
        {child_id, next_source, next_state} =
          case parse_datum(
                 original_source_string,
                 remaining_source,
                 current_state,
                 # parent_id for the element
                 collection_node_id
               ) do
            {:ok, id, src, st} -> {id, src, st}
            {:error_node, id, _reason, src, st} -> {id, src, st}
          end

        parse_collection_elements(
          original_source_string,
          next_source,
          next_state,
          collection_node_id,
          [child_id | children_ids_acc],
          collection_start_pos_tuple,
          closing_char_str,
          unclosed_error_message,
          element_error_message
        )
      end
  end
end

# Fills in the final location/raw_string/children of a collection whose
# closing token was never found (EOF reached first), tagging it with
# `unclosed_error_message`.
defp finalize_unclosed_collection(
       original_source_string,
       state_at_eos,
       collection_node_id,
       children_ids_acc,
       {start_offset, start_line, start_col},
       unclosed_error_message
     ) do
  collection_node = Map.get(state_at_eos.nodes, collection_node_id)
  end_offset = state_at_eos.offset

  actual_raw_string =
    String.slice(original_source_string, start_offset, end_offset - start_offset)

  updated_collection_node = %{
    collection_node
    | parsing_error: unclosed_error_message,
      children: Enum.reverse(children_ids_acc),
      location: [
        start_offset,
        start_line,
        start_col,
        end_offset,
        state_at_eos.line,
        state_at_eos.col
      ],
      raw_string: actual_raw_string
  }

  final_state = %{
    state_at_eos
    | nodes: Map.put(state_at_eos.nodes, collection_node_id, updated_collection_node)
  }

  # The collection node itself carries the unclosed-collection error.
  {:error, unclosed_error_message, "", final_state}
end
# --- Utility Functions ---
# Creates a node map with a fresh unique id, merges `extra_fields` over the
# common fields, stores it in the state's node map, and returns
# {node_id, new_state}. (`extra_fields` is required; the former optional
# default was never used, per compiler warnings.)
defp add_node(state, parent_id, location, raw_string, ast_node_type, extra_fields) do
  node_id = System.unique_integer([:monotonic, :positive])

  base_fields = %{
    id: node_id,
    type_id: nil,
    parent_id: parent_id,
    file: state.file_name,
    # location = [start_offset, start_line, start_col, end_offset, end_line, end_col]
    location: location,
    raw_string: raw_string,
    ast_node_type: ast_node_type
  }

  node = Map.merge(base_fields, extra_fields)

  {node_id, %{state | nodes: Map.put(state.nodes, node_id, node)}}
end
# Consumes leading whitespace. Returns:
#   {:eos, state}               - the source was completely empty
#   {:ok, remaining, new_state} - whitespace (possibly none) was skipped;
#                                 `remaining` may still be "" if the source
#                                 consisted solely of whitespace.
defp skip_whitespace("", state), do: {:eos, state}

defp skip_whitespace(source, state = %__MODULE__{offset: o, line: l, col: c}) do
  case Regex.run(~r/^\s+/, source) do
    [ws | _] ->
      ws_len = String.length(ws)
      {new_line, new_col} = calculate_new_line_col(ws, l, c)
      # split_at replaces the deprecated negative-step range slice
      # `String.slice(source, len..-1)`.
      {_ws, remaining_source} = String.split_at(source, ws_len)

      {:ok, remaining_source, %{state | offset: o + ws_len, line: new_line, col: new_col}}

    nil ->
      # No leading whitespace to skip.
      {:ok, source, state}
  end
end
# Walks `string_segment` from {start_line, start_col} and returns the
# {line, col} position just past its final character: each newline advances
# the line counter and resets the column to 1; every other codepoint moves
# the column one to the right.
defp calculate_new_line_col(string_segment, start_line, start_col) do
  case String.split(string_segment, "\n") do
    [only_line] ->
      # No newline: stay on the same line, advance the column.
      {start_line, start_col + String.length(only_line)}

    lines ->
      # One line per newline crossed; the column restarts after the last one.
      {start_line + length(lines) - 1, String.length(List.last(lines)) + 1}
  end
end
# Returns `state` advanced past `consumed_string`: the offset grows by its
# grapheme length and line/col are recomputed via calculate_new_line_col/3.
defp advance_pos(state = %__MODULE__{}, consumed_string) do
  {next_line, next_col} = calculate_new_line_col(consumed_string, state.line, state.col)

  %{
    state
    | offset: state.offset + String.length(consumed_string),
      line: next_line,
      col: next_col
  }
end
end