# NOTE: removed scraped page metadata (a concatenated commit-message log and
# "964 lines / 34 KiB / Elixir" file-view annotations) that was not valid
# Elixir source. The module itself begins below.
defmodule Til.Parser do
|
|
@moduledoc """
|
|
Parser for the Tilly Lisp dialect.
|
|
It transforms source code into a collection of Node Maps.
|
|
"""
|
|
|
|
# Represents the current parsing position.
# `offset` is a 0-based character offset into the source; `line`/`col` are
# 1-based. `nodes` accumulates every parsed node as `node_id => node_map`,
# and `file_name` is stamped onto each node created.
defstruct offset: 0, line: 1, col: 1, file_name: "unknown", nodes: %{}
|
|
|
|
@doc """
|
|
Parses a source string into a map of AST nodes.
|
|
"""
|
|
# Entry point: parses `source_string` into `{:ok, nodes_map}` where
# `nodes_map` is a flat map of `node_id => node`. A synthetic `:file` node is
# created up-front as the root; each top-level expression is parented to it.
# Parse errors never abort the whole parse — they surface as nodes carrying a
# `:parsing_error` field.
def parse(source_string, file_name \\ "unknown") do
  file_node_id = System.unique_integer([:monotonic, :positive])

  # Initial location for the file node (starts at the beginning)
  file_start_offset = 0
  file_start_line = 1
  file_start_col = 1

  # End location and raw_string will be finalized after parsing all content
  prelim_file_node = %{
    id: file_node_id,
    type_id: nil,
    # File node is the root
    parent_id: nil,
    file: file_name,
    # End TBD
    location: [file_start_offset, file_start_line, file_start_col, 0, 0, 0],
    # TBD
    raw_string: "",
    ast_node_type: :file,
    # TBD
    children: [],
    parsing_error: nil
  }

  initial_state = %__MODULE__{
    file_name: file_name,
    # Add prelim file node so children can reference it as parent
    nodes: %{file_node_id => prelim_file_node},
    # Initial state offset should be 0 for the file
    offset: 0,
    # Initial state line should be 1
    line: 1,
    # Initial state col should be 1
    col: 1
  }

  # Pass original_source_string for raw_string extraction, and file_node_id as
  # parent for top-level exprs
  final_state_after_expressions =
    parse_all_expressions(source_string, source_string, initial_state, file_node_id)

  # Finalize the file node.
  # Calculate end position of the entire source string.
  {file_end_line, file_end_col} = calculate_new_line_col(source_string, 1, 1)
  # Offset is 0-indexed, length is the count of characters, so end_offset is length.
  file_end_offset = String.length(source_string)

  # Collect children of the file node (any node whose parent is the file node)
  file_children_ids =
    final_state_after_expressions.nodes
    |> Map.values()
    |> Enum.filter(&(&1.parent_id == file_node_id))
    # Sort by start offset (first element of location) to maintain order of
    # appearance in the source
    |> Enum.sort_by(fn node -> hd(node.location) end)
    |> Enum.map(& &1.id)

  updated_file_node =
    final_state_after_expressions.nodes
    |> Map.get(file_node_id)
    |> Map.merge(%{
      location: [
        file_start_offset,
        file_start_line,
        file_start_col,
        file_end_offset,
        file_end_line,
        file_end_col
      ],
      # The entire source is the raw string of the file node
      raw_string: source_string,
      children: file_children_ids
    })

  final_nodes =
    Map.put(final_state_after_expressions.nodes, file_node_id, updated_file_node)

  {:ok, final_nodes}
end
|
|
|
|
# --- Main Parsing Logic ---
|
|
|
|
# original_source_string is the complete initial source, source_string is the current remainder
|
|
# parent_id_for_top_level_expressions is the ID of the node that top-level expressions should be parented to (e.g., the :file node)
|
|
# Repeatedly parses top-level data until the source is exhausted.
#
# `original_source_string` is the complete input (needed for raw-string
# extraction further down); `source_string` is the yet-unparsed remainder;
# `parent_id_for_top_level_expressions` is the node every top-level
# expression is parented to (the :file node).
#
# Both success and error-node results from parse_datum consume input, so the
# loop always makes progress. (If parse_datum ever returned :error_node
# without consuming on a non-empty string, this would loop forever —
# create_error_node_and_advance currently consumes at least 1 char.)
defp parse_all_expressions(
       original_source_string,
       source_string,
       state,
       parent_id_for_top_level_expressions
     ) do
  case skip_whitespace(source_string, state) do
    {:eos, done_state} ->
      done_state

    {:ok, "", done_state} ->
      # Only trailing whitespace remained — parsing for this branch is done.
      done_state

    {:ok, remainder, state_after_ws} ->
      # There is actual content; parse one datum, then recurse on whatever
      # is left. Error nodes are already recorded in the state, so both
      # outcomes continue identically.
      {next_source, next_state} =
        case parse_datum(
               original_source_string,
               remainder,
               state_after_ws,
               parent_id_for_top_level_expressions
             ) do
          {:ok, _node_id, src, st} -> {src, st}
          {:error_node, _node_id, _reason, src, st} -> {src, st}
        end

      parse_all_expressions(
        original_source_string,
        next_source,
        next_state,
        parent_id_for_top_level_expressions
      )
  end
end
|
|
|
|
# Parses a single datum: an atom (integer, symbol) or a list.
|
|
# Parses a single datum and dispatches on the leading token:
#   "m{"  -> map expression (checked first: multi-char token)
#   "("   -> S-expression        ")" -> error node ("Unexpected ')'")
#   "["   -> list expression     "]" -> error node ("Unexpected ']'")
#   "{"   -> tuple expression    "}" -> error node ("Unexpected '}'")
#   "'"   -> string literal
#   ":"   -> atom literal, falling back to symbol, then error node
#   else  -> integer, falling back to symbol, then error node
# Always returns either {:ok, node_id, rest, state} or
# {:error_node, node_id, reason, rest, state}; both consume input.
defp parse_datum(original_source_string, source, state, parent_id) do
  # Peek for multi-character tokens first
  cond do
    String.starts_with?(source, "m{") ->
      # Returns {:ok | :error_node, ...}
      parse_map_expression(original_source_string, source, state, parent_id)

    # Fallback to single character dispatch
    true ->
      char = String.first(source)

      cond do
        char == "(" ->
          # Returns {:ok | :error_node, ...}
          parse_s_expression(original_source_string, source, state, parent_id)

        char == ")" ->
          # Unexpected closing parenthesis, consume 1 char for the error token ')'
          # Returns {:error_node, ...}
          create_error_node_and_advance(source, state, parent_id, 1, "Unexpected ')'")

        char == "[" ->
          # Returns {:ok | :error_node, ...}
          parse_list_expression(original_source_string, source, state, parent_id)

        char == "]" ->
          # Unexpected closing square bracket, consume 1 char for the error token ']'
          # Returns {:error_node, ...}
          create_error_node_and_advance(source, state, parent_id, 1, "Unexpected ']'")

        # For tuples
        char == "{" ->
          # Returns {:ok | :error_node, ...}
          parse_tuple_expression(original_source_string, source, state, parent_id)

        char == "}" ->
          # Unexpected closing curly brace
          # Returns {:error_node, ...}
          create_error_node_and_advance(source, state, parent_id, 1, "Unexpected '}'")

        char == "'" ->
          # Returns {:ok | :error_node, ...}
          parse_string_datum(original_source_string, source, state, parent_id)

        char == ":" ->
          # If the first char is ':', try to parse as an atom like :foo
          case parse_atom_datum(source, state, parent_id) do
            {:ok, node_id, rest, new_state} ->
              {:ok, node_id, rest, new_state}

            {:error, :not_atom} ->
              # Failed to parse as a specific atom (e.g. ":foo").
              # It could be a symbol that starts with ':' (e.g. if we allow ":" as a symbol).
              # Fallback to general symbol parsing. Integer parsing won't match if it starts with ':'.
              case parse_symbol_datum(source, state, parent_id) do
                {:ok, node_id, rest, new_state} ->
                  {:ok, node_id, rest, new_state}

                {:error, :not_symbol} ->
                  # If it started with ':' but wasn't a valid atom and also not a valid symbol
                  create_error_node_and_advance(source, state, parent_id, 1, "Unknown token starting with ':'")
              end
          end

        true ->
          # Default case for other characters.
          # Try parsing as an integer first
          case parse_integer_datum(source, state, parent_id) do
            {:ok, node_id, rest, new_state} ->
              {:ok, node_id, rest, new_state}

            {:error, :not_integer} ->
              # Not an integer, try parsing as a symbol
              case parse_symbol_datum(source, state, parent_id) do
                {:ok, node_id, rest, new_state} ->
                  {:ok, node_id, rest, new_state}

                {:error, :not_symbol} ->
                  # Not a symbol either. Consume 1 char for the unknown token.
                  create_error_node_and_advance(source, state, parent_id, 1, "Unknown token")
              end
          end
      end # end inner cond
  end # end outer cond
end
|
|
|
|
# --- Datum Parsing Helpers --- (parse_string_datum, process_string_content)
|
|
|
|
# Parses a single-quote-delimited string literal starting at `source`.
#
# The column of the opening quote determines `strip_indent`: that many
# leading whitespace characters are removed from every continuation line of
# a multiline string (see process_string_content/2).
#
# Returns {:ok, node_id, rest, state} on success, or — when no closing quote
# exists — records a :literal_string node with parsing_error set and returns
# {:error_node, node_id, "Unclosed string literal", "", state} consuming the
# remainder of the input.
#
# BUGFIX: `:binary.match/2` returns a BYTE offset, but the previous code cut
# the content with `String.slice`, which counts graphemes. For string content
# containing multi-byte UTF-8 characters the split landed in the wrong place.
# We now slice with `binary_part/3` (byte-based) so the byte offset is used
# consistently. This also removes the deprecated negative-range String.slice.
defp parse_string_datum(_original_source_string, source, state, parent_id) do
  # state is before consuming "'"
  initial_state_for_token = state
  # Columns are 1-based; everything left of the opening quote is the indent
  # stripped from continuation lines.
  strip_indent = initial_state_for_token.col - 1

  # Consume opening "'"
  {opening_tick, source_after_opening_tick} = String.split_at(source, 1)

  case :binary.match(source_after_opening_tick, "'") do
    :nomatch ->
      # Unclosed string: everything up to EOF becomes the literal's content.
      content_segment = source_after_opening_tick
      raw_token = opening_tick <> content_segment

      state_at_node_end = advance_pos(initial_state_for_token, raw_token)

      location = [
        initial_state_for_token.offset,
        initial_state_for_token.line,
        initial_state_for_token.col,
        state_at_node_end.offset,
        state_at_node_end.line,
        state_at_node_end.col
      ]

      processed_value = process_string_content(content_segment, strip_indent)

      {node_id, state_with_error_node} =
        add_node(
          initial_state_for_token,
          parent_id,
          location,
          raw_token,
          :literal_string,
          %{value: processed_value, parsing_error: "Unclosed string literal"}
        )

      final_state = %{
        state_with_error_node
        | offset: state_at_node_end.offset,
          line: state_at_node_end.line,
          col: state_at_node_end.col
      }

      {:error_node, node_id, "Unclosed string literal", "", final_state}

    {idx_closing_tick_in_segment, _tick_length} ->
      # idx_closing_tick_in_segment is a byte offset; slice by bytes.
      content_segment =
        binary_part(source_after_opening_tick, 0, idx_closing_tick_in_segment)

      closing_tick = "'"
      raw_token = opening_tick <> content_segment <> closing_tick

      # Skip past the closing quote (1 byte) to get the remainder.
      rest_start = idx_closing_tick_in_segment + 1

      rest_of_source =
        binary_part(
          source_after_opening_tick,
          rest_start,
          byte_size(source_after_opening_tick) - rest_start
        )

      state_at_node_end = advance_pos(initial_state_for_token, raw_token)

      location = [
        initial_state_for_token.offset,
        initial_state_for_token.line,
        initial_state_for_token.col,
        state_at_node_end.offset,
        state_at_node_end.line,
        state_at_node_end.col
      ]

      processed_value = process_string_content(content_segment, strip_indent)

      {new_node_id, state_with_node} =
        add_node(
          initial_state_for_token,
          parent_id,
          location,
          raw_token,
          :literal_string,
          %{value: processed_value}
        )

      final_state = %{
        state_with_node
        | offset: state_at_node_end.offset,
          line: state_at_node_end.line,
          col: state_at_node_end.col
      }

      {:ok, new_node_id, rest_of_source, final_state}
  end
end
|
|
|
|
# Normalizes a multiline string literal's content: the first line is kept
# as-is, while each subsequent line has up to `strip_indent` leading
# whitespace characters removed (never more than the line actually has).
#
# Improvements over the previous version:
#   * head|tail pattern match replaces List.first/List.delete_at — String.split
#     always yields at least one element, even for "".
#   * the deprecated negative-range `String.slice(line, n..-1)` is replaced by
#     the explicit start/length form.
defp process_string_content(content_str, strip_indent) when strip_indent >= 0 do
  [first_line | rest_lines] = String.split(content_str, "\n", trim: false)

  processed_rest_lines =
    Enum.map(rest_lines, fn line ->
      # Count leading whitespace (\s covers spaces and tabs, as before).
      current_leading_spaces_count =
        Regex.run(~r/^(\s*)/, line)
        |> List.first()
        |> String.length()

      spaces_to_remove = min(current_leading_spaces_count, strip_indent)
      String.slice(line, spaces_to_remove, String.length(line) - spaces_to_remove)
    end)

  Enum.join([first_line | processed_rest_lines], "\n")
end
|
|
|
|
# --- Datum Parsing Helpers --- (parse_string_datum, process_string_content)
|
|
|
|
# (parse_string_datum remains unchanged)
|
|
|
|
# Parses an atom literal: a colon followed by one or more non-delimiter
# characters (delimiters: whitespace, (, ), [, ], {, }). The colon itself is
# part of the atom's raw string; `atom_name_part` is what follows it.
# Returns {:ok, node_id, rest, state} or {:error, :not_atom} (no input
# consumed on failure — e.g. a bare ":" or ":(" does not match).
defp parse_atom_datum(source, state, parent_id) do
  case Regex.run(~r/^:([^\s\(\)\[\]\{\}]+)/, source) do
    # raw_atom_str is like ":foo", atom_name_part is "foo".
    # The regex [^...]+ ensures atom_name_part is not empty.
    [raw_atom_str, atom_name_part] ->
      # raw_atom_str is a literal prefix of source, so dropping it by byte
      # length is exact; this replaces the deprecated negative-range
      # String.slice(source, len..-1).
      rest_after_atom =
        binary_part(
          source,
          byte_size(raw_atom_str),
          byte_size(source) - byte_size(raw_atom_str)
        )

      start_offset = state.offset
      start_line = state.line
      start_col = state.col
      state_after_token = advance_pos(state, raw_atom_str)
      end_offset = state_after_token.offset
      end_line = state_after_token.line
      end_col = state_after_token.col
      location = [start_offset, start_line, start_col, end_offset, end_line, end_col]

      # SECURITY NOTE(review): String.to_atom/1 creates a new atom per distinct
      # name and atoms are never garbage-collected. If this parser is ever fed
      # untrusted input, this can exhaust the atom table — consider interning
      # names as strings or using String.to_existing_atom/1. Kept as-is here
      # to preserve behavior.
      atom_value = String.to_atom(atom_name_part)

      {new_node_id, state_with_node} =
        add_node(
          state,
          parent_id,
          location,
          raw_atom_str,
          :literal_atom,
          %{value: atom_value}
        )

      final_state = %{
        state_with_node
        | offset: end_offset,
          line: end_line,
          col: end_col
      }

      {:ok, new_node_id, rest_after_atom, final_state}

    # No match (nil, or ":" followed immediately by a delimiter)
    _ ->
      {:error, :not_atom}
  end
end
|
|
|
|
# Attempts to parse an integer literal at the head of `source`.
# Returns {:ok, node_id, rest, state} on success or {:error, :not_integer}
# (state and source untouched) when the head is not numeric.
defp parse_integer_datum(source, state, parent_id) do
  case Integer.parse(source) do
    :error ->
      # Indicates failure, source and state are unchanged by this attempt
      {:error, :not_integer}

    {int_val, rest_after_int} ->
      # The consumed prefix is the difference between input and remainder.
      consumed_len = String.length(source) - String.length(rest_after_int)
      raw_int = String.slice(source, 0, consumed_len)

      advanced = advance_pos(state, raw_int)

      location = [
        state.offset,
        state.line,
        state.col,
        advanced.offset,
        advanced.line,
        advanced.col
      ]

      {new_node_id, state_with_node} =
        add_node(state, parent_id, location, raw_int, :literal_integer, %{value: int_val})

      # Update state to reflect the consumed token.
      final_state = %{
        state_with_node
        | offset: advanced.offset,
          line: advanced.line,
          col: advanced.col
      }

      {:ok, new_node_id, rest_after_int, final_state}
  end
end
|
|
|
|
# Parses a symbol: one or more characters that are not whitespace or one of
# the structural delimiters ( ) [ ] { }. `m{` is dispatched before symbol
# parsing ever runs (see parse_datum/4), so it cannot be swallowed here.
# Returns {:ok, node_id, rest, state} or {:error, :not_symbol}.
defp parse_symbol_datum(source, state, parent_id) do
  case Regex.run(~r/^([^\s\(\)\[\]\{\}]+)/, source) do
    [raw_symbol | _] ->
      # raw_symbol is a literal prefix of source; drop it by byte length.
      # Replaces the deprecated negative-range String.slice(source, len..-1).
      rest_after_symbol =
        binary_part(
          source,
          byte_size(raw_symbol),
          byte_size(source) - byte_size(raw_symbol)
        )

      start_offset = state.offset
      start_line = state.line
      start_col = state.col
      state_after_token = advance_pos(state, raw_symbol)
      end_offset = state_after_token.offset
      end_line = state_after_token.line
      end_col = state_after_token.col
      location = [start_offset, start_line, start_col, end_offset, end_line, end_col]

      {new_node_id, state_with_node} =
        add_node(state, parent_id, location, raw_symbol, :symbol, %{name: raw_symbol})

      # Update state to reflect consumed token
      final_state = %{
        state_with_node
        | offset: end_offset,
          line: end_line,
          col: end_col
      }

      {:ok, new_node_id, rest_after_symbol, final_state}

    nil ->
      # Indicates failure, source and state are unchanged by this attempt
      {:error, :not_symbol}
  end
end
|
|
|
|
# Records a :unknown node carrying `error_message` for the next
# `num_chars_for_token` characters of input, and advances past them.
# Consuming at least one character here is what guarantees the parser's
# outer loops always make progress on bad input.
# Returns {:error_node, node_id, error_message, rest, state}.
defp create_error_node_and_advance(
       source_for_token,
       state_before_token,
       parent_id,
       num_chars_for_token,
       error_message
     ) do
  {raw_token, rest_of_source} = String.split_at(source_for_token, num_chars_for_token)

  advanced = advance_pos(state_before_token, raw_token)

  location = [
    state_before_token.offset,
    state_before_token.line,
    state_before_token.col,
    advanced.offset,
    advanced.line,
    advanced.col
  ]

  {error_node_id, state_with_error_node} =
    add_node(state_before_token, parent_id, location, raw_token, :unknown, %{
      parsing_error: error_message
    })

  # Further parsing must see both the consumed position and the new node.
  final_error_state = %{
    state_with_error_node
    | offset: advanced.offset,
      line: advanced.line,
      col: advanced.col
  }

  {:error_node, error_node_id, error_message, rest_of_source, final_error_state}
end
|
|
|
|
# Parses an S-expression "(...)" via the generic collection parser. If the
# parsed node turns out to be an `fn` form (first child is the symbol "fn"),
# it is rewritten in place into a :lambda_expression node; otherwise the
# plain :s_expression result is returned unchanged. Errors from
# parse_collection propagate untouched.
defp parse_s_expression(original_source_string, source, state, parent_id) do
  # Standard S-expression parsing via parse_collection
  result = parse_collection(
    original_source_string,
    source,
    state,
    parent_id,
    "(",
    ")",
    :s_expression,
    "Unclosed S-expression",
    "Error parsing element in S-expression. Content might be incomplete."
  )

  # After parsing, check if it's an 'fn' expression
  case result do
    {:ok, collection_node_id, rest_after_collection, state_after_collection} ->
      collection_node = Map.get(state_after_collection.nodes, collection_node_id)

      if is_fn_expression?(collection_node, state_after_collection.nodes) do
        # Rewrite the node under the same id; the transformed node replaces
        # the original in the nodes map.
        transformed_node =
          transform_to_lambda_expression(collection_node, state_after_collection.nodes)

        final_state = %{
          state_after_collection
          | nodes:
              Map.put(state_after_collection.nodes, transformed_node.id, transformed_node)
        }

        {:ok, transformed_node.id, rest_after_collection, final_state}
      else
        # Not an fn expression, return as is
        result
      end

    _error_or_other ->
      # Propagate errors or other results from parse_collection
      result
  end
end
|
|
|
|
# Helper to check if an S-expression node is an 'fn' expression
|
|
# Returns true when `s_expr_node` is an S-expression whose first child is
# the symbol "fn".
#
# FIX: the previous `a && b && c` chain could return `nil` (when the first
# child was missing from the nodes map) rather than a boolean — a `?`
# predicate should always return true/false. Pattern matching also removes
# the Enum.empty?/hd pair.
defp is_fn_expression?(s_expr_node, nodes_map) do
  case s_expr_node do
    %{ast_node_type: :s_expression, children: [first_child_id | _]} ->
      case Map.get(nodes_map, first_child_id) do
        %{ast_node_type: :symbol, name: "fn"} -> true
        _ -> false
      end

    _ ->
      false
  end
end
|
|
|
|
# Helper to transform a generic S-expression node (known to be an 'fn' form)
|
|
# into a :lambda_expression node.
|
|
# Rewrites a generic S-expression node (already known to be an 'fn' form —
# see is_fn_expression?/2) into a :lambda_expression node.
#
# Expected shape: (fn (arg_spec* ret_type_spec?) body*), i.e.
# s_expr_node.children = [fn_symbol_id, params_s_expr_id, body_form1_id, ...].
# Within the params S-expression, the LAST element is the return-type spec
# and everything before it is an argument spec; an empty params list means
# no args and an inferred (nil) return type.
#
# On any structural problem the original node is returned with
# :parsing_error set instead of being transformed. Pure: operates on the
# node maps only, never on parser state.
defp transform_to_lambda_expression(s_expr_node, nodes_map) do
  # s_expr_node.children = [fn_symbol_id, params_s_expr_id, body_form1_id, ...]
  _fn_symbol_id = Enum.at(s_expr_node.children, 0) # Already checked

  if length(s_expr_node.children) < 2 do
    %{s_expr_node | parsing_error: "Malformed 'fn' expression: missing parameters list."}
  else
    params_s_expr_id = Enum.at(s_expr_node.children, 1)
    params_s_expr_node = Map.get(nodes_map, params_s_expr_id)

    if !(params_s_expr_node && params_s_expr_node.ast_node_type == :s_expression) do
      Map.put(s_expr_node, :parsing_error, "Malformed 'fn' expression: parameters list is not an S-expression.")
    else
      # Children of the parameters S-expression, e.g. for (fn ((a integer) (b atom) atom) ...),
      # param_s_expr_children_ids would be IDs of [(a integer), (b atom), atom]
      all_param_children_ids = Map.get(params_s_expr_node, :children, [])

      {arg_spec_node_ids, return_type_spec_node_id} =
        if Enum.empty?(all_param_children_ids) do
          # Case: (fn () body) -> No args, nil (inferred) return type spec
          {[], nil}
        else
          # Case: (fn (arg1 type1 ... ret_type) body)
          # Last element is return type spec, rest are arg specs.
          args = Enum.take(all_param_children_ids, length(all_param_children_ids) - 1)
          ret_type_id = List.last(all_param_children_ids)
          {args, ret_type_id}
        end

      # Validate arg_spec_node_ids: each must be a symbol or an S-expr (param_symbol type_spec)
      all_arg_specs_valid =
        Enum.all?(arg_spec_node_ids, fn arg_id ->
          arg_node = Map.get(nodes_map, arg_id)

          case arg_node do
            %{ast_node_type: :symbol} -> true # e.g. x
            %{ast_node_type: :s_expression, children: s_children} -> # e.g. (x integer)
              if length(s_children) == 2 do
                param_sym_node = Map.get(nodes_map, hd(s_children))
                type_spec_node = Map.get(nodes_map, hd(tl(s_children)))

                param_sym_node && param_sym_node.ast_node_type == :symbol &&
                  type_spec_node && (type_spec_node.ast_node_type == :symbol || type_spec_node.ast_node_type == :s_expression)
              else
                false # Not a valid (param_symbol type_spec) structure
              end

            _ -> false # Not a symbol or valid S-expression for arg spec
          end
        end)

      # Validate return_type_spec_node_id: must be nil or a valid type specifier node
      return_type_spec_valid =
        if is_nil(return_type_spec_node_id) do
          true # Inferred return type is valid
        else
          ret_type_node = Map.get(nodes_map, return_type_spec_node_id)
          ret_type_node && (ret_type_node.ast_node_type == :symbol || ret_type_node.ast_node_type == :s_expression)
        end

      if all_arg_specs_valid && return_type_spec_valid do
        body_node_ids = Enum.drop(s_expr_node.children, 2) # Body starts after 'fn' and params_s_expr

        Map.merge(s_expr_node, %{
          :ast_node_type => :lambda_expression,
          :params_s_expr_id => params_s_expr_id,
          :arg_spec_node_ids => arg_spec_node_ids,
          :return_type_spec_node_id => return_type_spec_node_id,
          :body_node_ids => body_node_ids
        })
      else
        # Determine more specific error message
        error_message =
          cond do
            !all_arg_specs_valid -> "Malformed 'fn' expression: invalid argument specification(s)."
            !return_type_spec_valid -> "Malformed 'fn' expression: invalid return type specification."
            true -> "Malformed 'fn' expression." # Generic fallback
          end

        Map.put(s_expr_node, :parsing_error, error_message)
      end
    end
  end
end
|
|
|
|
# Parses a square-bracket list "[...]" by delegating to the generic
# collection parser.
defp parse_list_expression(original_source_string, source, state, parent_id) do
  open = "["
  close = "]"

  parse_collection(
    original_source_string,
    source,
    state,
    parent_id,
    open,
    close,
    :list_expression,
    "Unclosed list",
    "Error parsing element in list. Content might be incomplete."
  )
end
|
|
|
|
# Parses a map "m{...}" by delegating to the generic collection parser.
# The opening token is the two-character "m{"; the closing token is "}".
defp parse_map_expression(original_source_string, source, state, parent_id) do
  open = "m{"
  close = "}"

  parse_collection(
    original_source_string,
    source,
    state,
    parent_id,
    open,
    close,
    :map_expression,
    "Unclosed map",
    "Error parsing element in map. Content might be incomplete."
  )
end
|
|
|
|
# Parses a tuple "{...}" by delegating to the generic collection parser.
# (A leading "m{" is claimed by the map parser before we ever get here.)
defp parse_tuple_expression(original_source_string, source, state, parent_id) do
  open = "{"
  close = "}"

  parse_collection(
    original_source_string,
    source,
    state,
    parent_id,
    open,
    close,
    :tuple_expression,
    "Unclosed tuple",
    "Error parsing element in tuple. Content might be incomplete."
  )
end
|
|
|
|
# Generic parser for delimited collections (S-expressions, lists, tuples,
# maps). Consumes the opening token, registers a preliminary collection node
# (so children can be parented to it immediately), then hands off to
# parse_collection_elements/9 which parses children until the closing token
# or EOF.
#
# Returns {:ok, node_id, rest, state} on success, or
# {:error_node, node_id, reason, rest, state} when the collection is
# unclosed; in both cases the collection node exists in state.nodes.
defp parse_collection(
       original_source_string,
       source,
       state,
       parent_id,
       open_char_str,
       # Used by parse_collection_elements
       close_char_str,
       ast_node_type,
       # Used by parse_collection_elements
       unclosed_error_msg,
       # Used by parse_collection_elements
       element_error_msg
     ) do
  # Consume opening token (e.g. '(', '[', 'm{')
  collection_start_offset = state.offset
  collection_start_line = state.line
  collection_start_col = state.col
  open_char_len = String.length(open_char_str)
  {_opening_token, rest_after_opening_token} = String.split_at(source, open_char_len)
  current_state = advance_pos(state, open_char_str)

  collection_node_id = System.unique_integer([:monotonic, :positive])

  # Preliminary node: location end, raw_string and children are finalized
  # by parse_collection_elements once the closing token (or EOF) is seen.
  prelim_collection_node = %{
    id: collection_node_id,
    type_id: nil,
    parent_id: parent_id,
    file: current_state.file_name,
    # End TBD
    location: [collection_start_offset, collection_start_line, collection_start_col, 0, 0, 0],
    # TBD
    raw_string: "",
    ast_node_type: ast_node_type,
    children: [],
    parsing_error: nil
  }

  current_state_with_prelim_node = %{
    current_state
    | nodes: Map.put(current_state.nodes, collection_node_id, prelim_collection_node)
  }

  collection_start_pos_for_children =
    {collection_start_offset, collection_start_line, collection_start_col}

  # Pass all necessary params to the generalized element parser
  result =
    parse_collection_elements(
      original_source_string,
      rest_after_opening_token,
      current_state_with_prelim_node,
      collection_node_id,
      [],
      collection_start_pos_for_children,
      # Parameters for generalization, passed from parse_collection's arguments:
      # Used by parse_collection_elements
      close_char_str,
      # Used by parse_collection_elements
      unclosed_error_msg,
      # Passed to parse_collection_elements (might be unused there now)
      element_error_msg
    )

  # Adapt result to {:ok, node_id, ...} or {:error_node, node_id, ...}
  case result do
    {:ok, returned_collection_node_id, rest, state_after_elements} ->
      {:ok, returned_collection_node_id, rest, state_after_elements}

    {:error, reason, rest, state_after_elements} ->
      # The collection_node_id is the ID of the node that has the error.
      # This 'reason' is typically for unclosed collections or fatal element errors.
      {:error_node, collection_node_id, reason, rest, state_after_elements}
  end
end
|
|
|
|
# Generalized from parse_s_expression_elements
|
|
# Parses the elements of an open collection until the closing token or EOF.
# Generalized from an earlier parse_s_expression_elements.
#
# Three outcomes per step:
#   * EOF before the closing token  -> finalize the collection node with
#     `unclosed_error_message` set and return {:error, msg, "", state};
#   * closing token reached         -> finalize location/raw_string/children
#     and return {:ok, collection_node_id, rest, state};
#   * otherwise                     -> parse one child datum (parented to the
#     collection) and recurse, accumulating child ids in reverse.
#
# `children_ids_acc` is built newest-first and reversed on finalization.
# `collection_start_pos_tuple` is {start_offset, start_line, start_col} of
# the opening token, used for the final location and raw_string slice.
defp parse_collection_elements(
       original_source_string,
       source,
       state,
       collection_node_id,
       children_ids_acc,
       collection_start_pos_tuple,
       # New parameters for generalization:
       # e.g., ")" or "]"
       closing_char_str,
       # e.g., "Unclosed S-expression"
       unclosed_error_message,
       # e.g., "Error parsing element in S-expression..."
       # Now potentially unused, marked with underscore
       element_error_message
     ) do
  case skip_whitespace(source, state) do
    {:eos, current_state_at_eos} ->
      # Unclosed collection
      collection_node = Map.get(current_state_at_eos.nodes, collection_node_id)
      start_offset = elem(collection_start_pos_tuple, 0)
      end_offset = current_state_at_eos.offset

      actual_raw_string =
        String.slice(original_source_string, start_offset, end_offset - start_offset)

      updated_collection_node = %{
        collection_node
        | # Use generalized message
          parsing_error: unclosed_error_message,
          children: Enum.reverse(children_ids_acc),
          location: [
            start_offset,
            elem(collection_start_pos_tuple, 1),
            elem(collection_start_pos_tuple, 2),
            end_offset,
            current_state_at_eos.line,
            current_state_at_eos.col
          ],
          raw_string: actual_raw_string
      }

      final_state = %{
        current_state_at_eos
        | nodes:
            Map.put(current_state_at_eos.nodes, collection_node_id, updated_collection_node)
      }

      # This error is for the collection itself being unclosed.
      # The collection_node_id is implicitly the ID of this error node.
      {:error, unclosed_error_message, "", final_state}

    {:ok, remaining_source, current_state} ->
      # Check if the remaining source starts with the closing token string
      if String.starts_with?(remaining_source, closing_char_str) do
        # End of collection
        closing_char_len = String.length(closing_char_str)

        {_closing_token, rest_after_closing_token} =
          String.split_at(remaining_source, closing_char_len)

        final_collection_state = advance_pos(current_state, closing_char_str)
        collection_node = Map.get(final_collection_state.nodes, collection_node_id)

        coll_final_start_offset = elem(collection_start_pos_tuple, 0)
        coll_final_start_line = elem(collection_start_pos_tuple, 1)
        coll_final_start_col = elem(collection_start_pos_tuple, 2)
        coll_final_end_offset = final_collection_state.offset
        coll_final_end_line = final_collection_state.line
        coll_final_end_col = final_collection_state.col

        actual_raw_string =
          String.slice(
            original_source_string,
            coll_final_start_offset,
            coll_final_end_offset - coll_final_start_offset
          )

        updated_collection_node = %{
          collection_node
          | children: Enum.reverse(children_ids_acc),
            location: [
              coll_final_start_offset,
              coll_final_start_line,
              coll_final_start_col,
              coll_final_end_offset,
              coll_final_end_line,
              coll_final_end_col
            ],
            raw_string: actual_raw_string
        }

        final_state_with_collection = %{
          final_collection_state
          | nodes:
              Map.put(
                final_collection_state.nodes,
                collection_node_id,
                updated_collection_node
              )
        }

        {:ok, collection_node_id, rest_after_closing_token, final_state_with_collection}
      else
        # Parse an element
        case parse_datum(
               original_source_string,
               remaining_source,
               current_state,
               # parent_id for the element
               collection_node_id
             ) do
          {:ok, child_node_id, next_source_after_elem, next_state_after_elem} ->
            parse_collection_elements(
              original_source_string,
              next_source_after_elem,
              next_state_after_elem,
              collection_node_id,
              # Add successful child's ID
              [child_node_id | children_ids_acc],
              collection_start_pos_tuple,
              closing_char_str,
              unclosed_error_message,
              # Pass through, though may be unused
              element_error_message
            )

          {:error_node, child_error_node_id, _child_reason, next_source_after_elem,
           next_state_after_elem} ->
            # An error node was created for the child element. Add its ID and continue.
            parse_collection_elements(
              original_source_string,
              next_source_after_elem,
              next_state_after_elem,
              collection_node_id,
              # Add error child's ID
              [child_error_node_id | children_ids_acc],
              collection_start_pos_tuple,
              closing_char_str,
              unclosed_error_message,
              # Pass through
              element_error_message
            )

          # No other return types are expected from parse_datum if it always creates a node on error
          # or succeeds. If parse_datum could fail without creating a node and without consuming input,
          # that would be an issue here, potentially leading to infinite loops if not handled.
          # The current changes aim for parse_datum to always return :ok or :error_node.
        end
      end
  end
end
|
|
|
|
# --- Utility Functions ---
|
|
|
|
# Note: The `extra_fields` argument was changed from optional to required
|
|
# as the default value was never used according to compiler warnings.
|
|
# Builds a node map with the common fields and merges in `extra_fields`
# (e.g. %{value: ...} or %{parsing_error: ...}), storing it under a fresh
# monotonic id. Returns {node_id, updated_state}.
#
# Note: `extra_fields` is a required argument — the old optional default was
# never used (per compiler warnings).
defp add_node(state, parent_id, location, raw_string, ast_node_type, extra_fields) do
  node_id = System.unique_integer([:monotonic, :positive])

  base_node = %{
    id: node_id,
    type_id: nil,
    parent_id: parent_id,
    file: state.file_name,
    # [start_offset, start_line, start_col, end_offset, end_line, end_col]
    location: location,
    raw_string: raw_string,
    ast_node_type: ast_node_type
  }

  node = Map.merge(base_node, extra_fields)
  updated_state = %{state | nodes: Map.put(state.nodes, node_id, node)}

  {node_id, updated_state}
end
|
|
|
|
# Skips leading whitespace, returning:
#   {:eos, state}          — the source was empty (nothing at all left);
#   {:ok, rest, new_state} — leading whitespace (possibly none) consumed,
#                            with position advanced over it.
# Note: an all-whitespace source yields {:ok, "", state}, not :eos — callers
# check for "" themselves.
#
# Improvements: empty-source case handled by a pattern-matched clause instead
# of `String.length(source) == 0`; position advancement delegated to
# advance_pos/2 (identical arithmetic, previously duplicated inline); the
# deprecated negative-range String.slice replaced by byte-exact binary_part
# (the matched whitespace is a literal prefix of the source).
defp skip_whitespace("", state), do: {:eos, state}

defp skip_whitespace(source, state = %__MODULE__{}) do
  case Regex.run(~r/^\s+/, source) do
    [ws | _] ->
      remaining_source = binary_part(source, byte_size(ws), byte_size(source) - byte_size(ws))
      {:ok, remaining_source, advance_pos(state, ws)}

    nil ->
      # No leading whitespace
      {:ok, source, state}
  end
end
|
|
|
|
# Walks `string_segment` character by character, starting from
# {start_line, start_col}, and returns the {line, col} position immediately
# after it: every "\n" bumps the line and resets col to 1; any other
# character advances col by one.
defp calculate_new_line_col(string_segment, start_line, start_col) do
  step = fn
    "\n", {line, _col} -> {line + 1, 1}
    _char, {line, col} -> {line, col + 1}
  end

  string_segment
  |> String.codepoints()
  |> Enum.reduce({start_line, start_col}, step)
end
|
|
|
|
# Advances the parser position over `consumed_string`: offset grows by its
# character length, and line/col are recomputed via calculate_new_line_col/3.
defp advance_pos(state = %__MODULE__{}, consumed_string) do
  {new_line, new_col} =
    calculate_new_line_col(consumed_string, state.line, state.col)

  %{
    state
    | offset: state.offset + String.length(consumed_string),
      line: new_line,
      col: new_col
  }
end
|
|
end
|