فهرست منبع

Expand source stats to roughly cover more tokens. (#3277)

This is pretty rough both in code-quality and what it tries to cover.
The coverage approximation is probably fine as it mostly just gives some
vague idea of the distributions we should aim at when evaluating
performance. We still want performance with unusual distributions to be
good.

Happy to improve the code quality if folks want, and especially with
suggestions on what would help. For this kind of quick scripting, it
seemed "fine" to me.
Chandler Carruth 2 سال پیش
والد
کامیت
0e8e94d8bf
1 فایل‌های تغییر یافته به همراه 125 افزوده شده و 2 حذف شده
  1. 125 2
      scripts/source_stats.py

+ 125 - 2
scripts/source_stats.py

@@ -27,6 +27,12 @@ LINE_RE = re.compile(
     (?P<class_intro>\b(class|struct)\s+(?P<class_name>\w+)\b)|
     (?P<class_intro>\b(class|struct)\s+(?P<class_name>\w+)\b)|
     (?P<end_open_curly>{\s*(?P<open_curly_trailing_comment>//.*)?)|
     (?P<end_open_curly>{\s*(?P<open_curly_trailing_comment>//.*)?)|
     (?P<trailing_comment>//.*)|
     (?P<trailing_comment>//.*)|
+    (?P<internal_comment>/\*.*\*/)|
+    (?P<string_literal>"([^"]|\\")*"|'([^']|\\')*')|
+    (?P<float_literal>\b(0[xb][0-9a-fA-F']*|[0-9][0-9']*)\.[0-9a-fA-F']*([eEpP][0-9a-fA-F']*)?)|
+    (?P<int_literal>\b(0[xb][0-9a-fA-F']+|[0-9][0-9']*)([eEpP][0-9a-fA-F']*)?)|
+    (?P<symbol>[\[\]{}(),.;]|[-+=!@#$%^&*/?|<>]+)|
+    (?P<keyword>\b(auto|bool|break|case|catch|char|class|const|continue|default|do|double|else|enum|explicit|extern|false|float|for|friend|goto|if|inline|int|long|mutable|namespace|new|nullptr|operator|private|protected|public|return|short|signed|sizeof|static|struct|switch|template|this|throw|true|try|typedef|union|unsigned|using|virtual|void|while)\b)|
     (?P<id>\b\w+\b)
     (?P<id>\b\w+\b)
 """,
 """,
     re.X,
     re.X,
@@ -44,6 +50,23 @@ class Stats:
     comment_line_widths: Counter[int] = field(default_factory=lambda: Counter())
     comment_line_widths: Counter[int] = field(default_factory=lambda: Counter())
     lines_with_trailing_comments: int = 0
     lines_with_trailing_comments: int = 0
     classes: int = 0
     classes: int = 0
+    internal_comments: int = 0
+    string_literals: int = 0
+    string_literals_per_line: Counter[int] = field(
+        default_factory=lambda: Counter()
+    )
+    int_literals: int = 0
+    int_literals_per_line: Counter[int] = field(
+        default_factory=lambda: Counter()
+    )
+    float_literals: int = 0
+    float_literals_per_line: Counter[int] = field(
+        default_factory=lambda: Counter()
+    )
+    symbols: int = 0
+    symbols_per_line: Counter[int] = field(default_factory=lambda: Counter())
+    keywords: int = 0
+    keywords_per_line: Counter[int] = field(default_factory=lambda: Counter())
     identifiers: int = 0
     identifiers: int = 0
     identifier_widths: Counter[int] = field(default_factory=lambda: Counter())
     identifier_widths: Counter[int] = field(default_factory=lambda: Counter())
     ids_per_line: Counter[int] = field(default_factory=lambda: Counter())
     ids_per_line: Counter[int] = field(default_factory=lambda: Counter())
@@ -56,6 +79,17 @@ class Stats:
         self.comment_line_widths.update(other.comment_line_widths)
         self.comment_line_widths.update(other.comment_line_widths)
         self.lines_with_trailing_comments += other.lines_with_trailing_comments
         self.lines_with_trailing_comments += other.lines_with_trailing_comments
         self.classes += other.classes
         self.classes += other.classes
+        self.internal_comments += other.internal_comments
+        self.string_literals += other.string_literals
+        self.string_literals_per_line.update(other.string_literals_per_line)
+        self.int_literals += other.int_literals
+        self.int_literals_per_line.update(other.int_literals_per_line)
+        self.float_literals += other.float_literals
+        self.float_literals_per_line.update(other.float_literals_per_line)
+        self.symbols += other.symbols
+        self.symbols_per_line.update(other.symbols_per_line)
+        self.keywords += other.keywords
+        self.keywords_per_line.update(other.keywords_per_line)
         self.identifiers += other.identifiers
         self.identifiers += other.identifiers
         self.identifier_widths.update(other.identifier_widths)
         self.identifier_widths.update(other.identifier_widths)
         self.ids_per_line.update(other.ids_per_line)
         self.ids_per_line.update(other.ids_per_line)
@@ -82,6 +116,11 @@ def scan_file(file: Path) -> Stats:
             else:
             else:
                 stats.comment_line_widths[len(line)] += 1
                 stats.comment_line_widths[len(line)] += 1
             continue
             continue
+        line_string_literals = 0
+        line_int_literals = 0
+        line_float_literals = 0
+        line_symbols = 0
+        line_keywords = 0
         line_identifiers = 0
         line_identifiers = 0
         for m in re.finditer(LINE_RE, line):
         for m in re.finditer(LINE_RE, line):
             if m.group("trailing_comment"):
             if m.group("trailing_comment"):
@@ -89,10 +128,23 @@ def scan_file(file: Path) -> Stats:
                 break
                 break
             if m.group("class_intro"):
             if m.group("class_intro"):
                 stats.classes += 1
                 stats.classes += 1
+                line_keywords += 1
                 line_identifiers += 1
                 line_identifiers += 1
                 stats.identifier_widths[len(m.group("class_name"))] += 1
                 stats.identifier_widths[len(m.group("class_name"))] += 1
             elif m.group("end_open_curly"):
             elif m.group("end_open_curly"):
-                pass
+                line_symbols += 1
+            elif m.group("internal_comment"):
+                stats.internal_comments += 1
+            elif m.group("string_literal"):
+                line_string_literals += 1
+            elif m.group("int_literal"):
+                line_int_literals += 1
+            elif m.group("float_literal"):
+                line_float_literals += 1
+            elif m.group("symbol"):
+                line_symbols += 1
+            elif m.group("keyword"):
+                line_keywords += 1
             else:
             else:
                 assert m.group("id"), "Line is '%s', and match is '%s'" % (
                 assert m.group("id"), "Line is '%s', and match is '%s'" % (
                     line,
                     line,
@@ -100,6 +152,16 @@ def scan_file(file: Path) -> Stats:
                 )
                 )
                 line_identifiers += 1
                 line_identifiers += 1
                 stats.identifier_widths[len(m.group("id"))] += 1
                 stats.identifier_widths[len(m.group("id"))] += 1
+        stats.string_literals += line_string_literals
+        stats.string_literals_per_line[line_string_literals] += 1
+        stats.int_literals += line_int_literals
+        stats.int_literals_per_line[line_int_literals] += 1
+        stats.float_literals += line_float_literals
+        stats.float_literals_per_line[line_float_literals] += 1
+        stats.symbols += line_symbols
+        stats.symbols_per_line[line_symbols] += 1
+        stats.keywords += line_keywords
+        stats.keywords_per_line[line_keywords] += 1
         stats.identifiers += line_identifiers
         stats.identifiers += line_identifiers
         stats.ids_per_line[line_identifiers] += 1
         stats.ids_per_line[line_identifiers] += 1
     return stats
     return stats
@@ -136,19 +198,57 @@ Comment lines: %(comment_lines)d
 Empty comment lines: %(empty_comment_lines)d
 Empty comment lines: %(empty_comment_lines)d
 Lines with trailing comments: %(lines_with_trailing_comments)d
 Lines with trailing comments: %(lines_with_trailing_comments)d
 Classes: %(classes)d
 Classes: %(classes)d
+Internal comments: %(internal_comments)d
+String literals: %(string_literals)d
+Int literals: %(int_literals)d
+Float literals: %(float_literals)d
+Symbols: %(symbols)d
+Keywords: %(keywords)d
 IDs: %(identifiers)d"""
 IDs: %(identifiers)d"""
         % asdict(stats)
         % asdict(stats)
     )
     )
 
 
+    tokens = (
+        stats.string_literals
+        + stats.int_literals
+        + stats.float_literals
+        + stats.symbols
+        + stats.keywords
+        + stats.identifiers
+    )
+    print(
+        f"""
+Fraction of blank lines: {stats.blank_lines / stats.lines}
+Fraction of comment lines: {stats.comment_lines / stats.lines}
+
+Total counted tokens: {tokens}
+Fraction string literals: {stats.string_literals / tokens}
+Fraction int literals: {stats.int_literals / tokens}
+Fraction float literals: {stats.float_literals / tokens}
+Fraction symbols: {stats.symbols / tokens}
+Fraction keywords: {stats.keywords / tokens}
+Fraction IDs: {stats.identifiers / tokens}
+    """
+    )
+
     def print_histogram(
     def print_histogram(
         title: str, data: Dict[int, int], column_format: str
         title: str, data: Dict[int, int], column_format: str
     ) -> None:
     ) -> None:
         print()
         print()
-        print(title)
         key_min = min(data.keys())
         key_min = min(data.keys())
         key_max = max(data.keys()) + 1
         key_max = max(data.keys()) + 1
         values = [data.get(k, 0) for k in range(key_min, key_max)]
         values = [data.get(k, 0) for k in range(key_min, key_max)]
         keys = [column_format % k for k in range(key_min, key_max)]
         keys = [column_format % k for k in range(key_min, key_max)]
+        total = sum(values)
+        median = key_min
+        count = total
+        for k in range(key_min, key_max):
+            count -= data.get(k, 0)
+            if count <= total / 2:
+                median = k
+                break
+
+        print(title + f" (median: {median})")
         fig = tpl.figure()
         fig = tpl.figure()
         fig.barh(values, keys)
         fig.barh(values, keys)
         fig.show()
         fig.show()
@@ -156,6 +256,29 @@ IDs: %(identifiers)d"""
     print_histogram(
     print_histogram(
         "## Comment line widths ##", stats.comment_line_widths, "%d columns"
         "## Comment line widths ##", stats.comment_line_widths, "%d columns"
     )
     )
+
+    print_histogram(
+        "## String literals per line ##",
+        stats.string_literals_per_line,
+        "%d literals",
+    )
+    print_histogram(
+        "## Int literals per line ##",
+        stats.int_literals_per_line,
+        "%d literals",
+    )
+    print_histogram(
+        "## Float literals per line ##",
+        stats.float_literals_per_line,
+        "%d literals",
+    )
+    print_histogram(
+        "## Symbols per line ##", stats.symbols_per_line, "%d symbols"
+    )
+    print_histogram(
+        "## Keywords per line ##", stats.keywords_per_line, "%d keywords"
+    )
+
     print_histogram("## ID widths ##", stats.identifier_widths, "%d characters")
     print_histogram("## ID widths ##", stats.identifier_widths, "%d characters")
     print_histogram("## IDs per line ##", stats.ids_per_line, "%d ids")
     print_histogram("## IDs per line ##", stats.ids_per_line, "%d ids")