Parse macros with cexpr.

author: Emilio Cobos Álvarez <ecoal95@gmail.com> 2016-11-06 14:36:47 +0100
committer: Emilio Cobos Álvarez <ecoal95@gmail.com> 2016-11-08 20:55:42 +0100
commit: e22a11b4d8d37ff1ed850fd596a0110b433907ee (patch)
tree: 36a8d15a33bcfcfecafc8daa835c062403c2a3ec
parent: 7fe40e0cf92df36219308406dcb4130a848fb6f6 (diff)
8 files changed, 182 insertions, 73 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 4e455fa9..97bcb82f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -31,6 +31,7 @@ env_logger = "0.3"
 rustc-serialize = "0.3.19"
 syntex_syntax = "0.44"
 regex = "0.1"
+cexpr = "0.2"
 
 [dependencies.aster]
 features = ["with-syntex"]
diff --git a/src/clang.rs b/src/clang.rs
index e6d78123..177d7dab 100755
--- a/src/clang.rs
+++ b/src/clang.rs
@@ -4,8 +4,9 @@
 #![allow(non_upper_case_globals, dead_code)]
 
 
+use cexpr;
 use clangll::*;
-use std::{mem, ptr};
+use std::{mem, ptr, slice};
 use std::ffi::{CStr, CString};
 use std::fmt;
 use std::hash::Hash;
@@ -1051,18 +1052,18 @@ impl TranslationUnit {
         let range = cursor.extent();
         let mut tokens = vec![];
         unsafe {
-            let mut token_ptr = ::std::ptr::null_mut();
+            let mut token_ptr = ptr::null_mut();
             let mut num_tokens: c_uint = 0;
             clang_tokenize(self.x, range, &mut token_ptr, &mut num_tokens);
             if token_ptr.is_null() {
                 return None;
             }
-            let token_array = ::std::slice::from_raw_parts(token_ptr,
-                                                           num_tokens as usize);
+
+            let token_array = slice::from_raw_parts(token_ptr,
+                                                    num_tokens as usize);
             for &token in token_array.iter() {
                 let kind = clang_getTokenKind(token);
-                let spelling: String = clang_getTokenSpelling(self.x, token)
-                    .into();
+                let spelling = clang_getTokenSpelling(self.x, token).into();
 
                 tokens.push(Token {
                     kind: kind,
@@ -1073,6 +1074,62 @@ impl TranslationUnit {
         }
         Some(tokens)
     }
+
+    /// Convert a set of tokens from clang into `cexpr` tokens, for further
+    /// processing.
+    pub fn cexpr_tokens(&self,
+                        cursor: &Cursor)
+                        -> Option<Vec<cexpr::token::Token>> {
+        use cexpr::token;
+
+        let mut tokens = match self.tokens(cursor) {
+            Some(tokens) => tokens,
+            None => return None,
+        };
+
+        // FIXME(emilio): LLVM 3.9 at least always includes an extra token for
+        // no good reason (except if we're at EOF). So we do this kind of hack,
+        // where we skip trailing punctuation and trailing keywords that are
+        // known to cause problems.
+        //
+        // This is sort of unfortunate, though :(.
+        //
+        // I'll try to get it fixed in LLVM if I have the time to submit a
+        // patch.
+        let mut trim_last_token = false;
+        if let Some(token) = tokens.last() {
+            // The start of the next macro.
+            trim_last_token |= token.spelling == "#" &&
+                               token.kind == CXToken_Punctuation;
+
+            // A following keyword of any kind, like a following declaration.
+            trim_last_token |= token.kind == CXToken_Keyword;
+        }
+
+        if trim_last_token {
+            tokens.pop().unwrap();
+        }
+
+        Some(tokens.into_iter()
+            .filter_map(|token| {
+                let kind = match token.kind {
+                    CXToken_Punctuation => token::Kind::Punctuation,
+                    CXToken_Literal => token::Kind::Literal,
+                    CXToken_Identifier => token::Kind::Identifier,
+                    CXToken_Keyword => token::Kind::Keyword,
+                    // NB: cexpr is not too happy about comments inside
+                    // expressions, so we strip them down here.
+                    CXToken_Comment => return None,
+                    _ => panic!("Found unexpected token kind: {}", token.kind),
+                };
+
+                Some(token::Token {
+                    kind: kind,
+                    raw: token.spelling.into_bytes().into_boxed_slice(),
+                })
+            })
+            .collect::<Vec<_>>())
+    }
 }
 
 impl Drop for TranslationUnit {
diff --git a/src/ir/context.rs b/src/ir/context.rs
index f11b387a..977db9c5 100644
--- a/src/ir/context.rs
+++ b/src/ir/context.rs
@@ -1,10 +1,11 @@
 //! Common context that is passed around during parsing and codegen.
 
 use BindgenOptions;
+use cexpr;
 use clang::{self, Cursor};
 use parse::ClangItemParser;
 use std::borrow::{Borrow, Cow};
-use std::collections::{HashMap, HashSet, hash_map};
+use std::collections::{HashMap, hash_map};
 use std::collections::btree_map::{self, BTreeMap};
 use std::fmt;
 use super::int::IntKind;
@@ -77,8 +78,9 @@ pub struct BindgenContext<'ctx> {
     pub currently_parsed_types: Vec<(Cursor, ItemId)>,
 
     /// A HashSet with all the already parsed macro names. This is done to avoid
-    /// hard errors while parsing duplicated macros.
-    parsed_macros: HashSet<String>,
+    /// hard errors while parsing duplicated macros, as well as to allow macro
+    /// expression parsing.
+    parsed_macros: HashMap<Vec<u8>, cexpr::expr::EvalResult>,
 
     /// The active replacements collected from replaces="xxx" annotations.
     replacements: HashMap<String, ItemId>,
@@ -715,14 +717,21 @@ impl<'ctx> BindgenContext<'ctx> {
     }
 
     /// Have we parsed the macro named `macro_name` already?
-    pub fn parsed_macro(&self, macro_name: &str) -> bool {
-        self.parsed_macros.contains(macro_name)
+    pub fn parsed_macro(&self, macro_name: &[u8]) -> bool {
+        self.parsed_macros.contains_key(macro_name)
+    }
+
+    /// Get the currently parsed macros.
+    pub fn parsed_macros(&self) -> &HashMap<Vec<u8>, cexpr::expr::EvalResult> {
+        debug_assert!(!self.in_codegen_phase());
+        &self.parsed_macros
     }
 
     /// Mark the macro named `macro_name` as parsed.
-    pub fn note_parsed_macro(&mut self, macro_name: String) {
-        debug_assert!(!self.parsed_macros.contains(&macro_name));
-        self.parsed_macros.insert(macro_name);
+    pub fn note_parsed_macro(&mut self,
+                             id: Vec<u8>,
+                             value: cexpr::expr::EvalResult) {
+        self.parsed_macros.insert(id, value);
     }
 
     /// Are we in the codegen phase?
diff --git a/src/ir/var.rs b/src/ir/var.rs
index 33e56242..62f17030 100644
--- a/src/ir/var.rs
+++ b/src/ir/var.rs
@@ -1,7 +1,9 @@
 //! Intermediate representation of variables.
 
+use cexpr;
 use clang;
 use parse::{ClangItemParser, ClangSubItemParser, ParseError, ParseResult};
+use std::num::Wrapping;
 use super::context::BindgenContext;
 use super::function::cursor_mangling;
 use super::int::IntKind;
@@ -73,43 +75,61 @@ impl ClangSubItemParser for Var {
              ctx: &mut BindgenContext)
              -> Result<ParseResult<Self>, ParseError> {
         use clangll::*;
+        use cexpr::expr::EvalResult;
         match cursor.kind() {
             CXCursor_MacroDefinition => {
-                let value = parse_int_literal_tokens(&cursor,
-                                                     ctx.translation_unit());
+                let value = parse_macro(ctx, &cursor, ctx.translation_unit());
 
-                let value = match value {
+                let (id, value) = match value {
                     Some(v) => v,
                     None => return Err(ParseError::Continue),
                 };
 
-                let name = cursor.spelling();
-                if name.is_empty() {
-                    warn!("Empty macro name?");
-                    return Err(ParseError::Continue);
-                }
+                assert!(!id.is_empty(), "Empty macro name?");
 
-                if ctx.parsed_macro(&name) {
+                if ctx.parsed_macro(&id) {
+                    let name = String::from_utf8(id).unwrap();
                     warn!("Duplicated macro definition: {}", name);
                     return Err(ParseError::Continue);
                 }
-                ctx.note_parsed_macro(name.clone());
-
-                let ty = if value < 0 {
-                    Item::builtin_type(TypeKind::Int(IntKind::Int), true, ctx)
-                } else if value.abs() > u32::max_value() as i64 {
-                    Item::builtin_type(TypeKind::Int(IntKind::ULongLong),
-                                       true,
-                                       ctx)
-                } else {
-                    Item::builtin_type(TypeKind::Int(IntKind::UInt), true, ctx)
+
+                // NB: It's important to "note" the macro even if the result is
+                // not an integer, otherwise we might lose other kinds of
+                // derived macros.
+                ctx.note_parsed_macro(id.clone(), value.clone());
+
+                // NOTE: Unwrapping, here and above, is safe, because the
+                // identifier of a token comes straight from clang, and we
+                // enforce utf8 there, so we should have already panicked at
+                // this point.
+                let name = String::from_utf8(id).unwrap();
+                let (int_kind, val) = match value {
+                    // TODO(emilio): Handle the non-invalid ones!
+                    EvalResult::Float(..) |
+                    EvalResult::Char(..) |
+                    EvalResult::Str(..) |
+                    EvalResult::Invalid => return Err(ParseError::Continue),
+
+                    EvalResult::Int(Wrapping(value)) => {
+                        let kind = if value < 0 {
+                            if value < i32::min_value() as i64 {
+                                IntKind::LongLong
+                            } else {
+                                IntKind::Int
+                            }
+                        } else if value > u32::max_value() as i64 {
+                            IntKind::ULongLong
+                        } else {
+                            IntKind::UInt
+                        };
+
+                        (kind, value)
+                    }
                 };
 
-                Ok(ParseResult::New(Var::new(name,
-                                             None,
-                                             ty,
-                                             Some(value),
-                                             true),
+                let ty = Item::builtin_type(TypeKind::Int(int_kind), true, ctx);
+
+                Ok(ParseResult::New(Var::new(name, None, ty, Some(val), true),
                                     Some(cursor)))
             }
             CXCursor_VarDecl => {
@@ -153,49 +173,43 @@ impl ClangSubItemParser for Var {
     }
 }
 
-/// Try and parse the immediately found tokens from an unit (if any) to integers
+/// Try and parse a macro using all the macros parsed until now.
+fn parse_macro(ctx: &BindgenContext,
+               cursor: &clang::Cursor,
+               unit: &clang::TranslationUnit)
+               -> Option<(Vec<u8>, cexpr::expr::EvalResult)> {
+    use cexpr::{expr, nom};
+
+    let cexpr_tokens = match unit.cexpr_tokens(cursor) {
+        None => return None,
+        Some(tokens) => tokens,
+    };
+
+    let parser = expr::IdentifierParser::new(ctx.parsed_macros());
+    let result = parser.macro_definition(&cexpr_tokens);
+
+    match result {
+        nom::IResult::Done(_, (id, val)) => Some((id.into(), val)),
+        _ => None,
+    }
+}
+
 fn parse_int_literal_tokens(cursor: &clang::Cursor,
                             unit: &clang::TranslationUnit)
                             -> Option<i64> {
-    use clangll::{CXToken_Literal, CXToken_Punctuation};
+    use cexpr::{expr, nom};
+    use cexpr::expr::EvalResult;
 
-    let tokens = match unit.tokens(cursor) {
+    let cexpr_tokens = match unit.cexpr_tokens(cursor) {
         None => return None,
         Some(tokens) => tokens,
     };
 
-    let mut literal = None;
-    let mut negate = false;
-    for token in tokens.into_iter() {
-        match token.kind {
-            CXToken_Punctuation if token.spelling == "-" => {
-                negate = !negate;
-            }
-            CXToken_Literal => {
-                literal = Some(token.spelling);
-                break;
-            }
-            _ => {
-                // Reset values if we found anything else
-                negate = false;
-                literal = None;
-            }
-        }
+    // TODO(emilio): We can try to parse other kinds of literals.
+    match expr::expr(&cexpr_tokens) {
+        nom::IResult::Done(_, EvalResult::Int(Wrapping(val))) => Some(val),
+        _ => None,
     }
-
-    literal.and_then(|lit| {
-            if lit.starts_with("0x") {
-                // TODO: try to preserve hex literals?
-                i64::from_str_radix(&lit[2..], 16).ok()
-            } else if lit == "0" {
-                Some(0)
-            } else if lit.starts_with("0") {
-                i64::from_str_radix(&lit[1..], 8).ok()
-            } else {
-                lit.parse().ok()
-            }
-        })
-        .map(|lit| if negate { -lit } else { lit })
 }
 
 fn get_integer_literal_from_cursor(cursor: &clang::Cursor,
diff --git a/src/lib.rs b/src/lib.rs
index 03dac3cc..a632c461 100755
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -24,6 +24,7 @@
 
 #[macro_use]
 extern crate cfg_if;
+extern crate cexpr;
 extern crate syntex_syntax as syntax;
 extern crate aster;
 extern crate quasi;
diff --git a/tests/expectations/tests/jsval_layout_opaque.rs b/tests/expectations/tests/jsval_layout_opaque.rs
index f3c1014e..fa611f20 100644
--- a/tests/expectations/tests/jsval_layout_opaque.rs
+++ b/tests/expectations/tests/jsval_layout_opaque.rs
@@ -24,8 +24,9 @@ impl <T> ::std::clone::Clone for __BindgenUnionField<T> {
     fn clone(&self) -> Self { Self::new() }
 }
 impl <T> ::std::marker::Copy for __BindgenUnionField<T> { }
-pub const JSVAL_ALIGNMENT: ::std::os::raw::c_uint = 8;
 pub const JSVAL_TAG_SHIFT: ::std::os::raw::c_uint = 47;
+pub const JSVAL_PAYLOAD_MASK: ::std::os::raw::c_ulonglong = 140737488355327;
+pub const JSVAL_TAG_MASK: ::std::os::raw::c_longlong = -140737488355328;
 #[repr(u8)]
 #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
 pub enum JSValueType {
diff --git a/tests/expectations/tests/macro-expr-basic.rs b/tests/expectations/tests/macro-expr-basic.rs
new file mode 100644
index 00000000..7a5c71e0
--- /dev/null
+++ b/tests/expectations/tests/macro-expr-basic.rs
@@ -0,0 +1,14 @@
+/* automatically generated by rust-bindgen */
+
+
+#![allow(non_snake_case)]
+
+
+pub const FOO: ::std::os::raw::c_uint = 1;
+pub const BAR: ::std::os::raw::c_uint = 4;
+pub const BAZ: ::std::os::raw::c_uint = 5;
+pub const BARR: ::std::os::raw::c_uint = 1;
+pub const BAZZ: ::std::os::raw::c_uint = 7;
+pub const I_RAN_OUT_OF_DUMB_NAMES: ::std::os::raw::c_uint = 7;
+pub const HAZ_A_COMMENT: ::std::os::raw::c_uint = 1;
+pub const HAZ_A_COMMENT_INSIDE: ::std::os::raw::c_uint = 2;
diff --git a/tests/headers/macro-expr-basic.h b/tests/headers/macro-expr-basic.h
new file mode 100644
index 00000000..55b11367
--- /dev/null
+++ b/tests/headers/macro-expr-basic.h
@@ -0,0 +1,12 @@
+#define FOO 1
+#define BAR 4
+#define BAZ (FOO + BAR)
+
+#define BARR (1 << 0)
+#define BAZZ ((1 << 1) + BAZ)
+#define I_RAN_OUT_OF_DUMB_NAMES (BARR | BAZZ)
+
+/* I haz a comment */
+#define HAZ_A_COMMENT BARR
+
+#define HAZ_A_COMMENT_INSIDE (/* comment for real */ BARR + FOO)
author	Emilio Cobos Álvarez <ecoal95@gmail.com>	2016-11-06 14:36:47 +0100
committer	Emilio Cobos Álvarez <ecoal95@gmail.com>	2016-11-08 20:55:42 +0100
commit	e22a11b4d8d37ff1ed850fd596a0110b433907ee (patch)
tree	36a8d15a33bcfcfecafc8daa835c062403c2a3ec
parent	7fe40e0cf92df36219308406dcb4130a848fb6f6 (diff)