From cf319a06396ad4e4886f034973153c3321190380 Mon Sep 17 00:00:00 2001 From: heck Date: Sun, 13 Dec 2020 23:57:27 +0100 Subject: [PATCH] gen is getting there (simple_ast on the way out) --- gen/ast_parser.py | 78 +++++++++++--- gen/extract.py | 255 +++++++++++++++++++--------------------------- gen/simple_ast.py | 144 ++++++++++++++++++++++++++ 3 files changed, 313 insertions(+), 164 deletions(-) create mode 100644 gen/simple_ast.py diff --git a/gen/ast_parser.py b/gen/ast_parser.py index 6e60316..5fe88d6 100644 --- a/gen/ast_parser.py +++ b/gen/ast_parser.py @@ -3,51 +3,97 @@ import clang.cindex from clang.cindex import CursorKind -class AST_Parser: - def __init__(self,library_file=None): +class ASTParser: + def __init__(self, library_file): if not clang.cindex.Config.loaded: print("Using libclang from: %s", library_file) clang.cindex.Config.set_library_file(library_file) + import sys + sys.setrecursionlimit(10000) + print("max recursion limit:", sys.getrecursionlimit()) - def parse(self, filename, content): + def parse(self, filename, content=None, follow_includes=False): index = clang.cindex.Index.create() arguments = ["-x", "c"] options = clang.cindex.TranslationUnit.PARSE_SKIP_FUNCTION_BODIES - content = [(filename, content)] + if content: + content = [(filename, content)] + translation_unit = index.parse(filename, unsaved_files=content, args=arguments, options=options) - ret = self._parse(translation_unit.cursor, filename) + ret = self._parse(translation_unit.cursor, filename, follow_includes) return ret - def _get_children_filelocal(self, cursor, path): - return [c for c in cursor.get_children() if c.location.file and c.location.file.name == path] + def _get_children(self, cursor, path, follow_includes=False): + if follow_includes: + return [c for c in cursor.get_children()] + else: + return [c for c in cursor.get_children() if c.location.file and c.location.file.name == path] - def _parse(self, cursor, path): + def _parse(self, cursor, path, follow_includes=False): item = {} + + # dont parse excluded CursorKinds excluded_cursortypes = [CursorKind.INTEGER_LITERAL] if not cursor.kind in excluded_cursortypes: - if not str(cursor.kind) == "": + # generic info for all CursorKinds + if str(cursor.get_tokens()): + str_tok = "" + for tok in cursor.get_tokens(): + str_tok += tok.spelling + " " + item["tokens"] = str_tok.rstrip() + + if str(cursor.kind): item["kind"] = str(cursor.kind) - if not cursor.spelling == "": + if cursor.spelling: item["name"] = cursor.spelling - if not cursor.displayname == "": - item["displayname"] = cursor.displayname + if cursor.displayname: + if cursor.displayname != item["name"]: + item["displayname"] = cursor.displayname - if not cursor.type.spelling == "": + if cursor.type.spelling: item["type"] = cursor.type.spelling - if not cursor.result_type.spelling == "": + if cursor.result_type.spelling: item["result_type"] = cursor.result_type.spelling + if cursor.semantic_parent: + if cursor.semantic_parent.kind.is_translation_unit(): + item["semantic_parent"] = "global" + else: + item["semantic_parent"] = cursor.semantic_parent.spelling + + item["is_definition"] = False + if cursor.is_definition(): + item["is_definition"] = cursor.is_definition() + + + # TODO: Ever occrus? + if cursor.canonical: + if cursor.canonical.spelling != cursor.spelling: + item["canonical"] = cursor.canonical.spelling + + # TYPE_REF specific info + # if cursor.kind == CursorKind.TYPE_REF: + # if cursor.get_definition(): + # definition = cursor.get_definition() + # item["definition"] = self._parse(definition, path, follow_includes=follow_includes) + + # ENUM specific info if cursor.kind == CursorKind.ENUM_CONSTANT_DECL: item["value"] = cursor.enum_value - child_cursors = self._get_children_filelocal(cursor, path) + # TYPEDEF specific info + if cursor.kind == CursorKind.TYPEDEF_DECL: + item["utype"] = cursor.underlying_typedef_type.spelling + + # get direct children + child_cursors = self._get_children(cursor, path, follow_includes) if len(child_cursors) > 0: child_arr = [] for child_cursor in child_cursors: - child_result = self._parse(child_cursor, path) + child_result = self._parse(child_cursor, path, follow_includes=follow_includes) if child_result: child_arr.append(child_result) diff --git a/gen/extract.py b/gen/extract.py index 5298369..202b235 100755 --- a/gen/extract.py +++ b/gen/extract.py @@ -3,10 +3,11 @@ import os import json -from ast_parser import AST_Parser +from ast_parser import ASTParser +from simple_ast import SimpleAST -def create_paths_list(dirname, filenames): +def join_dir_and_filenames(dirname, filenames): paths = [] for basename in filenames: path = dirname + basename @@ -14,170 +15,128 @@ def create_paths_list(dirname, filenames): return paths -def read_files(paths): - content = [] - for path in paths: - file_info = read_file(path) - content.append(file_info) - return content - - def read_file(path): with open(path) as f: file_content = f.read() - item = {"path": path, - "sourcecode": file_content} - return item + return file_content -def write_json(header, key): - outpath = header["outpath"] + "." + key + ".json" - with open(outpath, "w+") as f: - json.dump(header[key], f, indent=4) +# out-dir is in-dir if not specified +def create_header(path, out_dir=None): + header = {"path": "", + "dir": "", + "filename": "", + "out_dir": "", + "sourcecode": ""} + header["path"] = path + header["dir"] = os.path.dirname(path) + header["filename"] = os.path.basename(path) -def prepare_header(header, out_dir): - basename = os.path.basename(header.get("path")) - outpath = out_dir + basename - header["outpath"] = outpath + header["out_dir"] = header["dir"] + if out_dir: + header["out_dir"] = out_dir + + header["sourcecode"] = read_file(path) return header -def create_simple_ast(ast): - elems = {"functions": "", - "structs": "", - "enums": ""} - elems["functions"] = extract_functions_from_ast(ast) - elems["structs"] = extract_structs_from_ast(ast) - elems["enums"] = extract_enums_from_ast(ast) - return elems - - -# extracts top level functions only (is there anything else in C?) -def extract_functions_from_ast(ast): - functions = [] - for child in ast["children"]: - if child["kind"] == "CursorKind.FUNCTION_DECL": - functions.append(simple_ast_functions(child)) - return functions - - -def simple_ast_functions(func_ast): - simple_func = {"name": "", - "return_type": "", - "arguments": []} - - simple_func["name"] = func_ast["name"] - simple_func["return_type"] = func_ast["result_type"] - arguments = [] - # check if func has args - if "children" in func_ast: - for arg_ast in func_ast["children"]: - arg_simple = None - if arg_ast["kind"] == "CursorKind.PARM_DECL": - arg_simple = {"name": "", - "type": ""} - arg_simple["name"] = arg_ast["name"] - arg_simple["type"] = arg_ast["type"] - if arg_simple: - arguments.append(arg_simple) - - if arguments: - simple_func["arguments"] = arguments - return simple_func - - -# only typedef enums -def extract_enums_from_ast(ast): - enums = [] - for typedef in ast["children"]: - if typedef["kind"] == "CursorKind.TYPEDEF_DECL": - typename = typedef["type"] - if "children" in typedef: - for enum in typedef["children"]: - if enum["kind"] == "CursorKind.ENUM_DECL": - enums.append(simple_ast_enums(enum, typename)) - - return enums - - -def simple_ast_enums(enum_ast, typename): - simple_enum = {"name": typename, - "items": []} - - items = [] - if "children" in enum_ast: - for item in enum_ast["children"]: - if item["kind"] == "CursorKind.ENUM_CONSTANT_DECL": - simple_item = {"name": "", - "value": ""} - simple_item["name"] = item["name"] - simple_item["value"] = item["value"] - items.append(simple_item) - - if items: - simple_enum["items"] = items - - return simple_enum - - -# only typedef structs -def extract_structs_from_ast(ast): - structs = [] - for typedef in ast["children"]: - if typedef["kind"] == "CursorKind.TYPEDEF_DECL": - typename = typedef["type"] - if "children" in typedef: - for struct in typedef["children"]: - if struct["kind"] == "CursorKind.STRUCT_DECL": - structs.append(simple_ast_structs(struct, typename)) - - return structs - -def simple_ast_structs(struct_ast, typename): - simple_struct = {"name" : typename, - "fields" : []} - - fields = [] - if "children" in struct_ast: - for field in struct_ast["children"]: - if field["kind"] == "CursorKind.FIELD_DECL": - simple_field = {"name": "", - "type": ""} - simple_field["name"] = field["name"] - simple_field["type"] = field["type"] - fields.append(simple_field) - - if fields: - simple_struct["fields"] = fields - - return simple_struct +def write_json(content, outpath): + # create path if not existing + out_dir = os.path.dirname(outpath) + if not os.path.isdir(out_dir): + os.makedirs(out_dir) + # write + with open(outpath, "w+") as f: + json.dump(content, f, indent=4) + +# generates simple-ast for each header specified in spec out dir. +def main_old(): + parser = ASTParser("/opt/local/libexec/llvm-9.0/lib/libclang.dylib") -def main(): - input() - parser = AST_Parser("/opt/local/libexec/llvm-9.0/lib/libclang.dylib") - # Input - prefix = r"/Users/heck/local-default/" + in_dir = r"/Users/heck/local-default/include/pEp/" filenames = ["pEpEngine.h", - "keymanagement.h"] - # Output - out_dir = "data/output/" - if not os.path.isdir(out_dir): - os.makedirs(out_dir) + "keymanagement.h", + "message_api.h", + "message.h", + "sync_api.h", + "key_reset.h", + "Identity.h", + "Rating.h"] - in_dir = prefix + r"include/pEp/" - paths = create_paths_list(in_dir, filenames) - headers = read_files(paths) + out_dir = "data/output" + + paths = join_dir_and_filenames(in_dir, filenames) + + headers = [] + for path in paths: + headers.append(create_header(path, out_dir)) for header in headers: - header = prepare_header(header, out_dir) - print("processing path: " + header.get("path") + "...") + print("processing path: " + header["path"] + "...") header["ast"] = parser.parse(header["path"], header["sourcecode"]) - write_json(header, "ast") + write_json(header["ast"], header["out_dir"] + "/" + header["filename"] + ".ast.json") + + simpleAst = SimpleAST() + header["simple_ast"] = simpleAst.create_simple_ast(header["ast"]) + write_json(header["simple_ast"], header["out_dir"] + "/" + header["filename"] + ".simple_ast.json") - header["simple_ast"] = create_simple_ast(header["ast"]) - write_json(header, "simple_ast") + +def recursive_query(data, filter, transform=lambda x: x): + resultset = [] + + # decorator just handling exceptions + def filter_decorator(data): + try: + return filter(data) + except KeyError: + pass + + # filter current data + if filter_decorator(data): + # transform result + xformed = transform(data) + if xformed: + resultset.append(xformed) + + # recurse + if "children" in data: + for item in data["children"]: + childres = recursive_query(item, filter, transform) + if childres: + resultset += childres + + return resultset + + +def main_new(): + parser = ASTParser("/opt/local/libexec/llvm-9.0/lib/libclang.dylib") + + # header = create_header("/Users/heck/local-default/include/pEp/pEpEngine.h", out_dir="./") + header = create_header("data/input/test_data/main_include.h") + + header["ast"] = parser.parse(header["path"], follow_includes=True) + write_json(header["ast"], header["out_dir"] + "/" + header["filename"] + ".ast.json") + + # query + def filter_xzy(item): + if (item["is_definition"] == False + and item["kind"] == "CursorKind.STRUCT_DECL" + ): + return True + + def xform(item): + return item + + matches = recursive_query(header["ast"], filter_xzy, xform) + # matches = list(set(matches)) + write_json(matches, header["out_dir"] + "/" + header["filename"] + ".matches.json") + + +def main(): + main_old() + # main_new() if __name__ == "__main__": diff --git a/gen/simple_ast.py b/gen/simple_ast.py new file mode 100644 index 0000000..a9154b4 --- /dev/null +++ b/gen/simple_ast.py @@ -0,0 +1,144 @@ +# -*- coding: utf-8 -*- + +class SimpleAST: + + def __init__(self): + pass + + def create_simple_ast(self,ast): + elems = {"functions": "", + "typedefs": "", + "structs": "", + "enums": ""} + elems["functions"] = self.extract_functions_from_ast(ast) + elems["typedefs"] = self.extract_typedefs_from_ast(ast) + elems["structs"] = self.extract_structs_from_ast(ast) + elems["enums"] = self.extract_enums_from_ast(ast) + return elems + + + # extracts top level functions only (is there anything else in C?) + def extract_functions_from_ast(self,ast): + functions = [] + for child in ast["children"]: + if child["kind"] == "CursorKind.FUNCTION_DECL": + functions.append(self._simple_ast_functions(child)) + return functions + + + def _simple_ast_functions(self, func_ast): + simple_func = {"name": "", + "return_type": "", + "arguments": []} + + simple_func["name"] = func_ast["name"] + simple_func["return_type"] = func_ast["result_type"] + arguments = [] + # check if func has args + if "children" in func_ast: + for arg_ast in func_ast["children"]: + arg_simple = None + if arg_ast["kind"] == "CursorKind.PARM_DECL": + arg_simple = {"name": "", + "type": ""} + arg_simple["name"] = arg_ast["name"] + arg_simple["type"] = arg_ast["type"] + if arg_simple: + arguments.append(arg_simple) + + if arguments: + simple_func["arguments"] = arguments + return simple_func + + + # only typedef enums + def extract_enums_from_ast(self, ast): + enums = [] + for typedef in ast["children"]: + if typedef["kind"] == "CursorKind.TYPEDEF_DECL": + typename = typedef["type"] + if "children" in typedef: + for enum in typedef["children"]: + if enum["kind"] == "CursorKind.ENUM_DECL": + enums.append(self._simple_ast_enums(enum, typename)) + + return enums + + + def _simple_ast_enums(self, enum_ast, typename): + simple_enum = {"name": typename, + "items": []} + + items = [] + if "children" in enum_ast: + for item in enum_ast["children"]: + if item["kind"] == "CursorKind.ENUM_CONSTANT_DECL": + simple_item = {"name": "", + "value": ""} + simple_item["name"] = item["name"] + simple_item["value"] = item["value"] + items.append(simple_item) + + if items: + simple_enum["items"] = items + + return simple_enum + + + # only typedef structs + def extract_structs_from_ast(self, ast): + structs = [] + for typedef in ast["children"]: + if typedef["kind"] == "CursorKind.TYPEDEF_DECL": + typename = typedef["type"] + if "children" in typedef: + for struct in typedef["children"]: + if struct["kind"] == "CursorKind.STRUCT_DECL": + structs.append(self._simple_ast_structs(struct, typename)) + + return structs + + + def _simple_ast_structs(self, struct_ast, typename): + simple_struct = {"name": typename, + "fields": []} + + fields = [] + if "children" in struct_ast: + for field in struct_ast["children"]: + if field["kind"] == "CursorKind.FIELD_DECL": + simple_field = {"name": "", + "type": ""} + simple_field["name"] = field["name"] + simple_field["type"] = field["type"] + fields.append(simple_field) + + if fields: + simple_struct["fields"] = fields + + return simple_struct + + + def extract_typedefs_from_ast(self, ast): + typdefs = [] + for typedef in ast["children"]: + if typedef["kind"] == "CursorKind.TYPEDEF_DECL": + typename = typedef["type"] + if "children" in typedef: + children = typedef["children"] + if children: + ref = children[0] + if ref["kind"] == "CursorKind.TYPE_REF": + typdefs.append(self._simple_ast_typedefs(ref, typename)) + + return typdefs + + + def _simple_ast_typedefs(self, typedef_ast, typename): + simple_typedef = {"name": typename, + "kind" : "", + "value": ""} + simple_typedef["kind"] = typedef_ast["kind"] + simple_typedef["value"] = typedef_ast["name"] + + return simple_typedef