diff options
Diffstat (limited to '')
| -rw-r--r-- | src/util/codes.rs | 126 | ||||
| -rw-r--r-- | src/util/encode.rs | 39 | ||||
| -rw-r--r-- | src/util/mod.rs | 1 | ||||
| -rw-r--r-- | src/util/normalize_identifier.rs | 2 | ||||
| -rw-r--r-- | src/util/span.rs | 57 | 
5 files changed, 161 insertions, 64 deletions
| diff --git a/src/util/codes.rs b/src/util/codes.rs new file mode 100644 index 0000000..8a46d02 --- /dev/null +++ b/src/util/codes.rs @@ -0,0 +1,126 @@ +//! Utilities to deal with character codes. + +use crate::constant::TAB_SIZE; +use crate::tokenizer::Code; + +/// Turn a string into codes. +pub fn parse(value: &str) -> Vec<Code> { +    let mut codes: Vec<Code> = vec![]; +    let mut at_start = true; +    let mut at_carriage_return = false; +    let mut column = 1; + +    for char in value.chars() { +        if at_start { +            if char == '\u{feff}' { +                // Ignore. +                continue; +            } + +            at_start = false; +        } + +        // Send a CRLF. +        if at_carriage_return && '\n' == char { +            at_carriage_return = false; +            codes.push(Code::CarriageReturnLineFeed); +        } else { +            // Send the previous CR: we’re not at a next `\n`. +            if at_carriage_return { +                at_carriage_return = false; +                codes.push(Code::Char('\r')); +            } + +            match char { +                // Send a replacement character. +                '\0' => { +                    column += 1; +                    codes.push(Code::Char('�')); +                } +                // Send a tab and virtual spaces. +                '\t' => { +                    let remainder = column % TAB_SIZE; +                    let mut virtual_spaces = if remainder == 0 { +                        0 +                    } else { +                        TAB_SIZE - remainder +                    }; +                    codes.push(Code::Char(char)); +                    column += 1; +                    while virtual_spaces > 0 { +                        codes.push(Code::VirtualSpace); +                        column += 1; +                        virtual_spaces -= 1; +                    } +                } +                // Send an LF. 
+                '\n' => { +                    column = 1; +                    codes.push(Code::Char(char)); +                } +                // Don’t send anything yet. +                '\r' => { +                    column = 1; +                    at_carriage_return = true; +                } +                // Send the char. +                _ => { +                    column += 1; +                    codes.push(Code::Char(char)); +                } +            } +        }; +    } + +    // Send the last CR: we’re not at a next `\n`. +    if at_carriage_return { +        codes.push(Code::Char('\r')); +    } + +    codes +} + +/// Serialize codes, optionally expanding tabs. +pub fn serialize(codes: &[Code], expand_tabs: bool) -> String { +    let mut at_tab = false; +    let mut index = 0; +    let mut value: Vec<char> = vec![]; + +    while index < codes.len() { +        let code = codes[index]; +        let mut at_tab_next = false; + +        match code { +            Code::CarriageReturnLineFeed => { +                value.push('\r'); +                value.push('\n'); +            } +            Code::Char(char) if char == '\n' || char == '\r' => { +                value.push(char); +            } +            Code::Char(char) if char == '\t' => { +                at_tab_next = true; +                value.push(if expand_tabs { ' ' } else { char }); +            } +            Code::VirtualSpace => { +                if !expand_tabs && at_tab { +                    index += 1; +                    continue; +                } +                value.push(' '); +            } +            Code::Char(char) => { +                value.push(char); +            } +            Code::None => { +                unreachable!("unexpected EOF code in codes"); +            } +        } + +        at_tab = at_tab_next; + +        index += 1; +    } + +    value.into_iter().collect() +} diff --git a/src/util/encode.rs b/src/util/encode.rs index 5762c22..a3bd589 
100644 --- a/src/util/encode.rs +++ b/src/util/encode.rs @@ -21,11 +21,36 @@  ///  /// *   [`micromark-util-encode` in `micromark`](https://github.com/micromark/micromark/tree/main/packages/micromark-util-encode)  pub fn encode(value: &str) -> String { -    // To do: replacing 4 times might just be slow. -    // Perhaps we can walk the chars. -    value -        .replace('&', "&amp;") -        .replace('"', "&quot;") -        .replace('<', "&lt;") -        .replace('>', "&gt;") +    let mut result: Vec<&str> = vec![]; +    let mut start = 0; +    let mut index = 0; + +    for byte in value.bytes() { +        if let Some(replacement) = match byte { +            b'&' => Some("&amp;"), +            b'"' => Some("&quot;"), +            b'<' => Some("&lt;"), +            b'>' => Some("&gt;"), +            _ => None, +        } { +            if start != index { +                result.push(&value[start..index]); +            } + +            result.push(replacement); +            start = index + 1; +        } + +        index += 1; +    } + +    if start == 0 { +        value.to_string() +    } else { +        if start < index { +            result.push(&value[start..index]); +        } + +        result.join("") +    }  } diff --git a/src/util/mod.rs b/src/util/mod.rs index 68ef275..d1a0e01 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -1,5 +1,6 @@  //! Utilities used when compiling markdown. +pub mod codes;  pub mod decode_character_reference;  pub mod edit_map;  pub mod encode; diff --git a/src/util/normalize_identifier.rs b/src/util/normalize_identifier.rs index 4753f7b..123a3a9 100644 --- a/src/util/normalize_identifier.rs +++ b/src/util/normalize_identifier.rs @@ -39,7 +39,7 @@ pub fn normalize_identifier(value: &str) -> String {      // Collapse markdown whitespace and trim it.      
for char in value.chars() {          match char { -            '\t' | '\r' | '\n' | ' ' => { +            '\t' | '\n' | '\r' | ' ' => {                  at_whitespace = true;              }              _ => { diff --git a/src/util/span.rs b/src/util/span.rs index 02811cc..32dd00f 100644 --- a/src/util/span.rs +++ b/src/util/span.rs @@ -1,20 +1,15 @@  //! Utilities to deal with semantic labels.  use crate::tokenizer::{Code, Event, EventType}; +use crate::util::codes::serialize as serialize_codes;  /// A struct representing the span of an opening and closing event of a token.  #[derive(Debug)]  pub struct Span { -    // To do: probably needed in the future. -    // start: Point,      /// Absolute offset (and `index` in `codes`) of where this span starts.      pub start_index: usize, -    // To do: probably needed in the future. -    // end: Point,      /// Absolute offset (and `index` in `codes`) of where this span ends.      pub end_index: usize, -    // To do: probably needed in the future. -    // token_type: TokenType,  }  /// Get a span from an event. @@ -29,10 +24,8 @@ pub struct Span {  /// When `micromark` is used, this function never panics.  pub fn from_exit_event(events: &[Event], index: usize) -> Span {      let exit = &events[index]; -    // let end = exit.point.clone();      let end_index = exit.index;      let token_type = exit.token_type.clone(); -    // To do: support `enter` events if needed and walk forwards?      
assert_eq!(          exit.event_type,          EventType::Exit, @@ -44,11 +37,8 @@ pub fn from_exit_event(events: &[Event], index: usize) -> Span {          let enter = &events[enter_index];          if enter.event_type == EventType::Enter && enter.token_type == token_type {              return Span { -                // start: enter.point.clone(),                  start_index: enter.index, -                // end,                  end_index, -                // token_type,              };          } @@ -65,48 +55,3 @@ pub fn serialize(all_codes: &[Code], span: &Span, expand_tabs: bool) -> String {  pub fn codes<'a>(codes: &'a [Code], span: &Span) -> &'a [Code] {      &codes[span.start_index..span.end_index]  } - -/// Serialize a slice of codes, optionally expanding tabs. -fn serialize_codes(codes: &[Code], expand_tabs: bool) -> String { -    let mut at_tab = false; -    let mut index = 0; -    let mut value: Vec<char> = vec![]; - -    while index < codes.len() { -        let code = codes[index]; -        let mut at_tab_next = false; - -        match code { -            Code::CarriageReturnLineFeed => { -                value.push('\r'); -                value.push('\n'); -            } -            Code::Char(char) if char == '\n' || char == '\r' => { -                value.push(char); -            } -            Code::Char(char) if char == '\t' => { -                at_tab_next = true; -                value.push(if expand_tabs { ' ' } else { char }); -            } -            Code::VirtualSpace => { -                if !expand_tabs && at_tab { -                    index += 1; -                    continue; -                } -                value.push(' '); -            } -            Code::Char(char) => { -                value.push(char); -            } -            Code::None => { -                unreachable!("unexpected EOF code in codes"); -            } -        } - -        at_tab = at_tab_next; - -        index += 1; -    } - -    
value.into_iter().collect() -} | 
