1 | /* -*- c++ -*- |
2 | kmime_charfreq.h |
3 | |
4 | KMime, the KDE Internet mail/usenet news message library. |
5 | Copyright (c) 2001-2002 Marc Mutz <mutz@kde.org> |
6 | |
7 | This library is free software; you can redistribute it and/or |
8 | modify it under the terms of the GNU Library General Public |
9 | License as published by the Free Software Foundation; either |
10 | version 2 of the License, or (at your option) any later version. |
11 | |
12 | This library is distributed in the hope that it will be useful, |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | Library General Public License for more details. |
16 | |
17 | You should have received a copy of the GNU Library General Public License |
18 | along with this library; see the file COPYING.LIB. If not, write to |
19 | the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
20 | Boston, MA 02110-1301, USA. |
21 | */ |
22 | /** |
23 | @file |
24 | This file is part of the API for handling @ref MIME data and |
25 | defines the CharFreq class. |
26 | |
27 | @brief |
28 | Defines the CharFreq class. |
29 | |
30 | @authors Marc Mutz \<mutz@kde.org\> |
31 | |
32 | @glossary @anchor Eight-Bit @anchor eight-bit @b 8-bit: |
33 | Data that contains bytes with at least one value greater than 127, or at |
34 | least one NUL byte. |
35 | |
36 | @glossary @anchor Eight-Bit-Binary @anchor eight-bit-binary @b 8-bit-binary: |
37 | Eight-bit data that contains a high percentage of non-ascii values, |
38 | or lines longer than 998 characters, or stray CRs, or NULs. |
39 | |
40 | @glossary @anchor Eight-Bit-Text @anchor eight-bit-text @b 8-bit-text: |
41 | Eight-bit data that contains a high percentage of ascii values, |
42 | no lines longer than 998 characters, no NULs, and either only LFs or |
43 | only CRLFs. |
44 | |
45 | @glossary @anchor Seven-Bit @anchor seven-bit @b 7-bit: |
46 | Data that contains bytes with all values less than 128, and no NULs. |
47 | |
48 | @glossary @anchor Seven-Bit-Binary @anchor seven-bit-binary @b 7-bit-binary: |
49 | Seven-bit data that contains a high percentage of control characters, |
50 | or lines longer than 998 characters, or stray CRs. |
51 | |
52 | @glossary @anchor Seven-Bit-Text @anchor seven-bit-text @b 7-bit-text: |
53 | Seven-bit data that contains a high percentage of ascii values, |
54 | no lines longer than 998 characters, and either only LFs, or only CRLFs. |
55 | */ |
56 | |
57 | #ifndef __KMIME_CHARFREQ_H__ |
58 | #define __KMIME_CHARFREQ_H__ |
59 | |
60 | #include <QtCore/QByteArray> |
61 | #include "kmime_export.h" |
62 | #undef None /* a macro named None (presumably leaked by X11 headers -- confirm) would break the CharFreq::None enumerator declared below */ |
63 | |
64 | namespace KMime { |
65 | |
66 | /** |
67 | @brief |
68 | A class for performing basic data typing using frequency count heuristics. |
69 | |
70 | This class performs character frequency counts on the provided data which |
71 | are used in heuristics to determine a basic data type. The data types are: |
72 | |
73 | - @ref Eight-Bit-Binary |
74 | - @ref Eight-Bit-Text |
75 | - @ref Seven-Bit-Binary |
76 | - @ref Seven-Bit-Text |
77 | */ |
78 | class KMIME_EXPORT CharFreq |
79 | { |
80 | public: |
81 | /** |
82 | Constructs a Character Frequency instance for a buffer @p buf of |
83 | QByteArray data. |
84 | |
85 | @param buf is a QByteArray containing the data. |
86 | */ |
87 | explicit CharFreq( const QByteArray &buf ); |
88 | |
89 | /** |
90 | Constructs a Character Frequency instance for a buffer @p buf of |
91 | chars of length @p len. |
92 | |
93 | @param buf is a pointer to a character string containing the data. |
94 | @param len is the length of @p buf, in characters. |
95 | */ |
96 | CharFreq( const char *buf, size_t len ); |
97 | |
98 | /** |
99 | The different types of data, as reported by type(). |
100 | */ |
101 | enum Type { |
102 | None = 0, /**< Unknown */ |
103 | EightBitData, /**< 8-bit binary data, see @ref Eight-Bit-Binary */ |
104 | Binary = EightBitData, /**< Alias for EightBitData (same enumerator value) */ |
105 | SevenBitData, /**< 7-bit binary data, see @ref Seven-Bit-Binary */ |
106 | EightBitText, /**< 8-bit text, see @ref Eight-Bit-Text */ |
107 | SevenBitText /**< 7-bit text, see @ref Seven-Bit-Text */ |
108 | }; |
109 | |
110 | /** |
111 | Returns the data #Type as derived from the class heuristics. |
112 | */ |
113 | Type type() const; |
114 | |
115 | /** |
116 | Returns true if the data #Type is EightBitData; false otherwise. |
117 | */ |
118 | bool isEightBitData() const; |
119 | |
120 | /** |
121 | Returns true if the data #Type is EightBitText; false otherwise. |
122 | */ |
123 | bool isEightBitText() const; |
124 | |
125 | /** |
126 | Returns true if the data #Type is SevenBitData; false otherwise. |
127 | */ |
128 | bool isSevenBitData() const; |
129 | |
130 | /** |
131 | Returns true if the data #Type is SevenBitText; false otherwise. |
132 | */ |
133 | bool isSevenBitText() const; |
134 | |
135 | /** |
136 | Returns true if the data contains trailing whitespace, i.e. |
137 | if any line ends with space (' ') or tab ('\\t'). |
138 | */ |
139 | bool hasTrailingWhitespace() const; |
140 | |
141 | /** |
142 | Returns true if the data contains a line that starts with "From ". |
143 | */ |
144 | bool hasLeadingFrom() const; |
145 | |
146 | /** |
147 | Returns the ratio of printable characters to all characters in the data. |
148 | The result is undefined if the number of data characters is zero. |
149 | */ |
150 | float printableRatio() const; |
151 | |
152 | /** |
153 | Returns the ratio of control code characters (CTLs) to all characters in the data. |
154 | The result is undefined if the number of data characters is zero. |
155 | */ |
156 | float controlCodesRatio() const; |
157 | |
158 | private: |
159 | //@cond PRIVATE |
160 | uint mNUL; // count of NUL chars |
161 | uint mCTL; // count of CTLs (incl. DEL, excl. CR, LF, HT) |
162 | uint mCR; // count of CR chars |
163 | uint mLF; // count of LF chars |
164 | uint mCRLF; // count of LFs preceded by CRs (i.e. CRLF sequences) |
165 | uint mPrintable; // count of printable US-ASCII chars (SPC..~) |
166 | uint mEightBit; // count of other latin1 chars (those with 8th bit set) |
167 | uint mTotal; // count of all chars |
168 | uint mLineMin; // minimum line length |
169 | uint mLineMax; // maximum line length |
170 | bool mTrailingWS; // does the buffer contain trailing whitespace? |
171 | bool mLeadingFrom; // does the buffer contain lines starting with "From "? |
172 | //@endcond |
173 | |
174 | /** |
175 | Performs the character frequency counts on the data. |
176 | |
177 | @param buf is a pointer to a character string containing the data. |
178 | @param len is the length of @p buf, in characters. |
179 | */ |
180 | void count( const char *buf, size_t len ); |
181 | }; |
182 | |
183 | } // namespace KMime |
184 | |
185 | #endif /* __KMIME_CHARFREQ_H__ */ |
186 | |