author | Emilio Cobos Álvarez <emilio@crisal.io> | 2018-12-14 03:10:20 +0100
---|---|---
committer | Emilio Cobos Álvarez <emilio@crisal.io> | 2018-12-14 10:57:40 +0100
commit | 7109c48a6b0eb770b805810ff1ca8046b3e1cde2 (patch) |
tree | 00913e59a7cf79f1fdad7ab4ca7a3e057dce91be |
parent | 698758e06fe00b756e37842f41f6c42d450f5ed8 (diff) |
clang: Tokenize more lazily.
Instead of converting all the tokens to utf-8 beforehand, which is costly, and unconditionally allocating a new vector (on top of the one clang already allocates), do the tokenization more lazily.
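The shape of the change is roughly: wrap the clang tokenizer result in an owning guard that frees the clang-side buffer on drop, and let callers iterate it lazily, borrowing each spelling as bytes. The sketch below is a minimal, self-contained model of that shape, not bindgen's actual code: the `RawTokens` here is a stand-in backed by a plain `Vec` so it runs without libclang, whereas the real type in the diff below wraps the `*mut CXToken` buffer.

```rust
// Minimal model of the lazy-tokenization shape (stand-in types, no libclang):
// a guard owns the token buffer, and `iter()` lends out byte slices instead of
// eagerly building a Vec of utf-8 Strings.
struct RawTokens {
    // bindgen stores the clang-allocated `*mut CXToken` buffer here and frees
    // it in `Drop`; a Vec keeps this example runnable without FFI.
    buf: Vec<Vec<u8>>,
}

impl RawTokens {
    // Lazily yield each token spelling as a borrowed byte slice.
    fn iter(&self) -> impl Iterator<Item = &[u8]> {
        self.buf.iter().map(|t| t.as_slice())
    }
}

fn main() {
    let tokens = RawTokens {
        buf: vec![b"inline".to_vec(), b"namespace".to_vec(), b"foo".to_vec()],
    };
    // No utf-8 conversion and no extra Vec: callers compare raw bytes lazily.
    assert!(tokens.iter().any(|spelling| spelling == b"inline"));
}
```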
There's actually only one place in the codebase that needs the utf-8 string; all the others can work directly with the byte slice that clang hands back.
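As a sketch of what that looks like at a call site, the module-name scan can match byte-string literals directly and only do a (lossy) utf-8 conversion for the single value it actually keeps. The `scan_token` function below is a hypothetical stand-alone reduction of the pattern, not the code from src/ir/context.rs; `spelling` stands in for the `&[u8]` returned by the new `ClangToken::spelling()`.

```rust
// Illustrative reduction of the byte-slice matching pattern.
fn scan_token(spelling: &[u8], module_name: &mut Option<String>) {
    match spelling {
        b"inline" => { /* mark the namespace as inline */ }
        b"namespace" | b"::" => { /* keep scanning for the name */ }
        b"{" => { /* body starts; stop scanning */ }
        name => {
            if module_name.is_none() {
                // The one conversion to an owned utf-8 String that remains.
                *module_name = Some(String::from_utf8_lossy(name).into_owned());
            }
        }
    }
}

fn main() {
    let mut module_name = None;
    let spellings: [&[u8]; 3] = [b"namespace", b"foo", b"{"];
    for spelling in spellings {
        scan_token(spelling, &mut module_name);
    }
    assert_eq!(module_name.as_deref(), Some("foo"));
}
```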
This should have no behavior change other than being faster. In particular, it halves the time spent on my machine on the test case from #1465.
I'm not completely sure that this is going to be enough to make it acceptable,
but we should probably do it regardless.
-rw-r--r-- | src/clang.rs | 172
-rw-r--r-- | src/ir/context.rs | 18
-rw-r--r-- | src/ir/var.rs | 4
3 files changed, 125 insertions, 69 deletions
diff --git a/src/clang.rs b/src/clang.rs
index a31cba31..8a3e3c68 100644
--- a/src/clang.rs
+++ b/src/clang.rs
@@ -507,11 +507,9 @@ impl Cursor {
         let mut found_attr = false;
         self.visit(|cur| {
             if cur.kind() == CXCursor_UnexposedAttr {
-                found_attr = cur.tokens().map(|tokens| {
-                    tokens.iter().any(|t| {
-                        t.kind == CXToken_Identifier && t.spelling == attr
-                    })
-                }).unwrap_or(false);
+                found_attr = cur.tokens().iter().any(|t| {
+                    t.kind == CXToken_Identifier && t.spelling() == attr.as_bytes()
+                });
 
                 if found_attr {
                     return CXChildVisit_Break;
@@ -653,64 +651,126 @@ impl Cursor {
     }
 
     /// Gets the tokens that correspond to that cursor.
-    pub fn tokens(&self) -> Option<Vec<Token>> {
-        let range = self.extent();
-        let mut tokens = vec![];
-        unsafe {
-            let tu = clang_Cursor_getTranslationUnit(self.x);
-            let mut token_ptr = ptr::null_mut();
-            let mut num_tokens: c_uint = 0;
-            clang_tokenize(tu, range, &mut token_ptr, &mut num_tokens);
-            if token_ptr.is_null() {
-                return None;
-            }
+    pub fn tokens(&self) -> RawTokens {
+        RawTokens::new(self)
+    }
 
-            let token_array =
-                slice::from_raw_parts(token_ptr, num_tokens as usize);
-            for &token in token_array.iter() {
-                let kind = clang_getTokenKind(token);
-                let spelling =
-                    cxstring_into_string(clang_getTokenSpelling(tu, token));
+    /// Gets the tokens that correspond to that cursor as `cexpr` tokens.
+    pub fn cexpr_tokens(self) -> Vec<cexpr::token::Token> {
+        use cexpr::token;
 
-                tokens.push(Token {
-                    kind: kind,
-                    spelling: spelling,
-                });
+        self.tokens().iter().filter_map(|token| {
+            let kind = match token.kind {
+                CXToken_Punctuation => token::Kind::Punctuation,
+                CXToken_Literal => token::Kind::Literal,
+                CXToken_Identifier => token::Kind::Identifier,
+                CXToken_Keyword => token::Kind::Keyword,
+                // NB: cexpr is not too happy about comments inside
+                // expressions, so we strip them down here.
+                CXToken_Comment => return None,
+                _ => {
+                    error!("Found unexpected token kind: {:?}", token);
+                    return None;
+                }
+            };
+
+            Some(token::Token {
+                kind,
+                raw: token.spelling().to_vec().into_boxed_slice(),
+            })
+        }).collect()
+    }
+}
+
+/// A struct that owns the tokenizer result from a given cursor.
+pub struct RawTokens<'a> {
+    cursor: &'a Cursor,
+    tu: CXTranslationUnit,
+    tokens: *mut CXToken,
+    token_count: c_uint,
+}
+
+impl<'a> RawTokens<'a> {
+    fn new(cursor: &'a Cursor) -> Self {
+        let mut tokens = ptr::null_mut();
+        let mut token_count = 0;
+        let range = cursor.extent();
+        let tu = unsafe {
+            clang_Cursor_getTranslationUnit(cursor.x)
+        };
+        unsafe { clang_tokenize(tu, range, &mut tokens, &mut token_count) };
+        Self { cursor, tu, tokens, token_count }
+    }
+
+    fn as_slice(&self) -> &[CXToken] {
+        if self.tokens.is_null() {
+            return &[];
+        }
+        unsafe { slice::from_raw_parts(self.tokens, self.token_count as usize) }
+    }
+
+    /// Get an iterator over these tokens.
+    pub fn iter(&self) -> ClangTokenIterator {
+        ClangTokenIterator {
+            tu: self.tu,
+            raw: self.as_slice().iter(),
+        }
+    }
+}
+
+impl<'a> Drop for RawTokens<'a> {
+    fn drop(&mut self) {
+        if !self.tokens.is_null() {
+            unsafe {
+                clang_disposeTokens(self.tu, self.tokens, self.token_count as c_uint);
             }
-            clang_disposeTokens(tu, token_ptr, num_tokens);
         }
-        Some(tokens)
     }
+}
 
-    /// Gets the tokens that correspond to that cursor as `cexpr` tokens.
-    pub fn cexpr_tokens(self) -> Option<Vec<cexpr::token::Token>> {
-        use cexpr::token;
+/// A raw clang token, that exposes only the kind and spelling. This is a
+/// slightly more convenient version of `CXToken` which owns the spelling
+/// string.
+#[derive(Debug)]
+pub struct ClangToken {
+    spelling: CXString,
+    /// The kind of token, this is the same as the relevant member from
+    /// `CXToken`.
+    pub kind: CXTokenKind,
+}
 
-        self.tokens().map(|tokens| {
-            tokens
-                .into_iter()
-                .filter_map(|token| {
-                    let kind = match token.kind {
-                        CXToken_Punctuation => token::Kind::Punctuation,
-                        CXToken_Literal => token::Kind::Literal,
-                        CXToken_Identifier => token::Kind::Identifier,
-                        CXToken_Keyword => token::Kind::Keyword,
-                        // NB: cexpr is not too happy about comments inside
-                        // expressions, so we strip them down here.
-                        CXToken_Comment => return None,
-                        _ => {
-                            error!("Found unexpected token kind: {:?}", token);
-                            return None;
-                        }
-                    };
-
-                    Some(token::Token {
-                        kind: kind,
-                        raw: token.spelling.into_bytes().into_boxed_slice(),
-                    })
-                })
-                .collect::<Vec<_>>()
-        })
+impl ClangToken {
+    /// Get the token spelling, without being converted to utf-8.
+    pub fn spelling(&self) -> &[u8] {
+        let c_str = unsafe {
+            CStr::from_ptr(clang_getCString(self.spelling) as *const _)
+        };
+        c_str.to_bytes()
+    }
+}
+
+impl Drop for ClangToken {
+    fn drop(&mut self) {
+        unsafe { clang_disposeString(self.spelling) }
+    }
+}
+
+/// An iterator over a set of Tokens.
+pub struct ClangTokenIterator<'a> {
+    tu: CXTranslationUnit,
+    raw: slice::Iter<'a, CXToken>,
+}
+
+impl<'a> Iterator for ClangTokenIterator<'a> {
+    type Item = ClangToken;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let raw = self.raw.next()?;
+        unsafe {
+            let kind = clang_getTokenKind(*raw);
+            let spelling = clang_getTokenSpelling(self.tu, *raw);
+            Some(ClangToken { kind, spelling })
+        }
     }
 }
diff --git a/src/ir/context.rs b/src/ir/context.rs
index 063c79f0..f8b4f54a 100644
--- a/src/ir/context.rs
+++ b/src/ir/context.rs
@@ -2163,21 +2163,17 @@ If you encounter an error missing from this list, please file an issue or a PR!"
         let mut module_name = None;
         let spelling = cursor.spelling();
-        if !spelling.is_empty()
-        {
+        if !spelling.is_empty() {
             module_name = Some(spelling)
         }
 
-        let tokens = match cursor.tokens() {
-            Some(tokens) => tokens,
-            None => return (module_name, ModuleKind::Normal),
-        };
+        let tokens = cursor.tokens();
         let mut iter = tokens.iter();
         let mut kind = ModuleKind::Normal;
         let mut found_namespace_keyword = false;
         while let Some(token) = iter.next() {
-            match &*token.spelling {
-                "inline" => {
+            match token.spelling() {
+                b"inline" => {
                     assert!(!found_namespace_keyword);
                     assert!(kind != ModuleKind::Inline);
                     kind = ModuleKind::Inline;
@@ -2192,16 +2188,16 @@ If you encounter an error missing from this list, please file an issue or a PR!"
                 //
                 // Fortunately enough, inline nested namespace specifiers aren't
                 // a thing, and are invalid C++ :)
-                "namespace" | "::" => {
+                b"namespace" | b"::" => {
                     found_namespace_keyword = true;
                 }
-                "{" => {
+                b"{" => {
                     assert!(found_namespace_keyword);
                     break;
                 }
                 name if found_namespace_keyword => {
                     if module_name.is_none() {
-                        module_name = Some(name.to_owned());
+                        module_name = Some(String::from_utf8_lossy(name).into_owned());
                     }
                     break;
                 }
diff --git a/src/ir/var.rs b/src/ir/var.rs
index 14f133fd..2180a1b8 100644
--- a/src/ir/var.rs
+++ b/src/ir/var.rs
@@ -309,7 +309,7 @@ fn parse_macro(
 ) -> Option<(Vec<u8>, cexpr::expr::EvalResult)> {
     use cexpr::expr;
 
-    let mut cexpr_tokens = cursor.cexpr_tokens()?;
+    let mut cexpr_tokens = cursor.cexpr_tokens();
 
     let parser = expr::IdentifierParser::new(ctx.parsed_macros());
 
@@ -338,7 +338,7 @@ fn parse_int_literal_tokens(cursor: &clang::Cursor) -> Option<i64> {
     use cexpr::expr;
     use cexpr::expr::EvalResult;
 
-    let cexpr_tokens = cursor.cexpr_tokens()?;
+    let cexpr_tokens = cursor.cexpr_tokens();
 
     // TODO(emilio): We can try to parse other kinds of literals.
     match expr::expr(&cexpr_tokens) {