//===--- MisleadingBidirectional.cpp - clang-tidy -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
8
9#include "MisleadingBidirectional.h"
10
11#include "clang/Frontend/CompilerInstance.h"
12#include "clang/Lex/Preprocessor.h"
13#include "llvm/Support/ConvertUTF.h"
14#include <optional>
15
16using namespace clang;
17using namespace clang::tidy::misc;
18
19static bool containsMisleadingBidi(StringRef Buffer,
20 bool HonorLineBreaks = true) {
21 const char *CurPtr = Buffer.begin();
22
23 enum BidiChar {
24 PS = 0x2029,
25 RLO = 0x202E,
26 RLE = 0x202B,
27 LRO = 0x202D,
28 LRE = 0x202A,
29 PDF = 0x202C,
30 RLI = 0x2067,
31 LRI = 0x2066,
32 FSI = 0x2068,
33 PDI = 0x2069
34 };
35
36 SmallVector<BidiChar> BidiContexts;
37
38 // Scan each character while maintaining a stack of opened bidi context.
39 // RLO/RLE/LRO/LRE all are closed by PDF while RLI LRI and FSI are closed by
40 // PDI. New lines reset the context count. Extra PDF / PDI are ignored.
41 //
42 // Warn if we end up with an unclosed context.
43 while (CurPtr < Buffer.end()) {
44 unsigned char C = *CurPtr;
45 if (isASCII(c: C)) {
46 ++CurPtr;
47 bool IsParagrapSep =
48 (C == 0xA || C == 0xD || (0x1C <= C && C <= 0x1E) || C == 0x85);
49 bool IsSegmentSep = (C == 0x9 || C == 0xB || C == 0x1F);
50 if (IsParagrapSep || IsSegmentSep)
51 BidiContexts.clear();
52 continue;
53 }
54 llvm::UTF32 CodePoint = 0;
55 llvm::ConversionResult Result = llvm::convertUTF8Sequence(
56 source: (const llvm::UTF8 **)&CurPtr, sourceEnd: (const llvm::UTF8 *)Buffer.end(),
57 target: &CodePoint, flags: llvm::strictConversion);
58
59 // If conversion fails, utf-8 is designed so that we can just try next char.
60 if (Result != llvm::conversionOK) {
61 ++CurPtr;
62 continue;
63 }
64
65 // Open a PDF context.
66 if (CodePoint == RLO || CodePoint == RLE || CodePoint == LRO ||
67 CodePoint == LRE)
68 BidiContexts.push_back(Elt: PDF);
69 // Close PDF Context.
70 else if (CodePoint == PDF) {
71 if (!BidiContexts.empty() && BidiContexts.back() == PDF)
72 BidiContexts.pop_back();
73 }
74 // Open a PDI Context.
75 else if (CodePoint == RLI || CodePoint == LRI || CodePoint == FSI)
76 BidiContexts.push_back(Elt: PDI);
77 // Close a PDI Context.
78 else if (CodePoint == PDI) {
79 auto R = llvm::find(Range: llvm::reverse(C&: BidiContexts), Val: PDI);
80 if (R != BidiContexts.rend())
81 BidiContexts.resize(N: BidiContexts.rend() - R - 1);
82 }
83 // Line break or equivalent
84 else if (CodePoint == PS)
85 BidiContexts.clear();
86 }
87 return !BidiContexts.empty();
88}
89
90class MisleadingBidirectionalCheck::MisleadingBidirectionalHandler
91 : public CommentHandler {
92public:
93 MisleadingBidirectionalHandler(MisleadingBidirectionalCheck &Check)
94 : Check(Check) {}
95
96 bool HandleComment(Preprocessor &PP, SourceRange Range) override {
97 // FIXME: check that we are in a /* */ comment
98 StringRef Text =
99 Lexer::getSourceText(Range: CharSourceRange::getCharRange(R: Range),
100 SM: PP.getSourceManager(), LangOpts: PP.getLangOpts());
101
102 if (containsMisleadingBidi(Buffer: Text, HonorLineBreaks: true))
103 Check.diag(
104 Loc: Range.getBegin(),
105 Description: "comment contains misleading bidirectional Unicode characters");
106 return false;
107 }
108
109private:
110 MisleadingBidirectionalCheck &Check;
111};
112
113MisleadingBidirectionalCheck::MisleadingBidirectionalCheck(
114 StringRef Name, ClangTidyContext *Context)
115 : ClangTidyCheck(Name, Context),
116 Handler(std::make_unique<MisleadingBidirectionalHandler>(args&: *this)) {}
117
118MisleadingBidirectionalCheck::~MisleadingBidirectionalCheck() = default;
119
120void MisleadingBidirectionalCheck::registerPPCallbacks(
121 const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) {
122 PP->addCommentHandler(Handler: Handler.get());
123}
124
125void MisleadingBidirectionalCheck::check(
126 const ast_matchers::MatchFinder::MatchResult &Result) {
127 if (const auto *SL = Result.Nodes.getNodeAs<StringLiteral>(ID: "strlit")) {
128 StringRef Literal = SL->getBytes();
129 if (containsMisleadingBidi(Buffer: Literal, HonorLineBreaks: false))
130 diag(Loc: SL->getBeginLoc(), Description: "string literal contains misleading "
131 "bidirectional Unicode characters");
132 }
133}
134
135void MisleadingBidirectionalCheck::registerMatchers(
136 ast_matchers::MatchFinder *Finder) {
137 Finder->addMatcher(NodeMatch: ast_matchers::stringLiteral().bind(ID: "strlit"), Action: this);
138}
139

// Source: clang-tools-extra/clang-tidy/misc/MisleadingBidirectional.cpp