| | |
| | |
| |
|
| | import re |
| | from io import StringIO |
| | import tokenize |
| | def remove_comments_and_docstrings(source,lang): |
| | if lang in ['python']: |
| | """ |
| | Returns 'source' minus comments and docstrings. |
| | """ |
| | io_obj = StringIO(source) |
| | out = "" |
| | prev_toktype = tokenize.INDENT |
| | last_lineno = -1 |
| | last_col = 0 |
| | for tok in tokenize.generate_tokens(io_obj.readline): |
| | token_type = tok[0] |
| | token_string = tok[1] |
| | start_line, start_col = tok[2] |
| | end_line, end_col = tok[3] |
| | ltext = tok[4] |
| | if start_line > last_lineno: |
| | last_col = 0 |
| | if start_col > last_col: |
| | out += (" " * (start_col - last_col)) |
| | |
| | if token_type == tokenize.COMMENT: |
| | pass |
| | |
| | elif token_type == tokenize.STRING: |
| | if prev_toktype != tokenize.INDENT: |
| | |
| | if prev_toktype != tokenize.NEWLINE: |
| | if start_col > 0: |
| | out += token_string |
| | else: |
| | out += token_string |
| | prev_toktype = token_type |
| | last_col = end_col |
| | last_lineno = end_line |
| | temp=[] |
| | for x in out.split('\n'): |
| | if x.strip()!="": |
| | temp.append(x) |
| | return '\n'.join(temp) |
| | elif lang in ['ruby']: |
| | return source |
| | else: |
| | def replacer(match): |
| | s = match.group(0) |
| | if s.startswith('/'): |
| | return " " |
| | else: |
| | return s |
| | pattern = re.compile( |
| | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', |
| | re.DOTALL | re.MULTILINE |
| | ) |
| | temp=[] |
| | for x in re.sub(pattern, replacer, source).split('\n'): |
| | if x.strip()!="": |
| | temp.append(x) |
| | return '\n'.join(temp) |
| |
|
| | def tree_to_token_index(root_node): |
| | if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': |
| | return [(root_node.start_point,root_node.end_point)] |
| | else: |
| | code_tokens=[] |
| | for child in root_node.children: |
| | code_tokens+=tree_to_token_index(child) |
| | return code_tokens |
| | |
| | def tree_to_variable_index(root_node,index_to_code): |
| | if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment': |
| | index=(root_node.start_point,root_node.end_point) |
| | _,code=index_to_code[index] |
| | if root_node.type!=code: |
| | return [(root_node.start_point,root_node.end_point)] |
| | else: |
| | return [] |
| | else: |
| | code_tokens=[] |
| | for child in root_node.children: |
| | code_tokens+=tree_to_variable_index(child,index_to_code) |
| | return code_tokens |
| |
|
| | def index_to_code_token(index,code): |
| | start_point=index[0] |
| | end_point=index[1] |
| | if start_point[0]==end_point[0]: |
| | s=code[start_point[0]][start_point[1]:end_point[1]] |
| | else: |
| | s="" |
| | s+=code[start_point[0]][start_point[1]:] |
| | for i in range(start_point[0]+1,end_point[0]): |
| | s+=code[i] |
| | s+=code[end_point[0]][:end_point[1]] |
| | return s |
| | |