1 | /* |
2 | kmime_charfreq.cpp |
3 | |
4 | KMime, the KDE Internet mail/usenet news message library. |
5 | Copyright (c) 2001-2002 Marc Mutz <mutz@kde.org> |
6 | |
7 | This library is free software; you can redistribute it and/or |
8 | modify it under the terms of the GNU Library General Public |
9 | License as published by the Free Software Foundation; either |
10 | version 2 of the License, or (at your option) any later version. |
11 | |
12 | This library is distributed in the hope that it will be useful, |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | Library General Public License for more details. |
16 | |
17 | You should have received a copy of the GNU Library General Public License |
18 | along with this library; see the file COPYING.LIB. If not, write to |
19 | the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
20 | Boston, MA 02110-1301, USA. |
21 | */ |
22 | |
23 | /** |
24 | @file |
25 | This file is part of the API for handling MIME data and |
26 | defines the CharFreq class. |
27 | |
28 | @brief |
29 | Defines the CharFreq class. |
30 | |
31 | @authors Marc Mutz \<mutz@kde.org\> |
32 | */ |
33 | |
34 | #include "kmime_charfreq.h" |
35 | |
36 | using namespace KMime; |
37 | |
38 | /** |
39 | * Private class that helps to provide binary compatibility between releases. |
40 | * @internal |
41 | */ |
42 | //@cond PRIVATE |
43 | //class KMime::CharFreq::Private |
44 | //{ |
45 | // public: |
46 | //}; |
47 | //@endcond |
48 | |
49 | CharFreq::CharFreq( const QByteArray &buf ) |
50 | : mNUL( 0 ), |
51 | mCTL( 0 ), |
52 | mCR( 0 ), mLF( 0 ), |
53 | mCRLF( 0 ), |
54 | mPrintable( 0 ), |
55 | mEightBit( 0 ), |
56 | mTotal( 0 ), |
57 | mLineMin( 0xffffffff ), |
58 | mLineMax( 0 ), |
59 | mTrailingWS( false ), |
60 | mLeadingFrom( false ) |
61 | { |
62 | if ( !buf.isEmpty() ) { |
63 | count( buf.data(), buf.size() ); |
64 | } |
65 | } |
66 | |
67 | CharFreq::CharFreq( const char *buf, size_t len ) |
68 | : mNUL( 0 ), |
69 | mCTL( 0 ), |
70 | mCR( 0 ), mLF( 0 ), |
71 | mCRLF( 0 ), |
72 | mPrintable( 0 ), |
73 | mEightBit( 0 ), |
74 | mTotal( 0 ), |
75 | mLineMin( 0xffffffff ), |
76 | mLineMax( 0 ), |
77 | mTrailingWS( false ), |
78 | mLeadingFrom( false ) |
79 | { |
80 | if ( buf && len > 0 ) { |
81 | count( buf, len ); |
82 | } |
83 | } |
84 | |
85 | //@cond PRIVATE |
// Returns true if ch is linear whitespace, i.e. a space or a horizontal tab.
static inline bool isWS( char ch )
{
  switch ( ch ) {
  case ' ':
  case '\t':
    return true;
  default:
    return false;
  }
}
90 | //@endcond |
91 | |
92 | void CharFreq::count( const char *it, size_t len ) |
93 | { |
94 | const char *end = it + len; |
95 | uint currentLineLength = 0; |
96 | // initialize the prevChar with LF so that From_ detection works w/o |
97 | // special-casing: |
98 | char prevChar = '\n'; |
99 | char prevPrevChar = 0; |
100 | |
101 | for ( ; it != end ; ++it ) { |
102 | ++currentLineLength; |
103 | switch ( *it ) { |
104 | case '\0': ++mNUL; break; |
105 | case '\r': ++mCR; break; |
106 | case '\n': ++mLF; |
107 | if ( prevChar == '\r' ) { |
108 | --currentLineLength; ++mCRLF; |
109 | } |
110 | if ( currentLineLength >= mLineMax ) { |
111 | mLineMax = currentLineLength-1; |
112 | } |
113 | if ( currentLineLength <= mLineMin ) { |
114 | mLineMin = currentLineLength-1; |
115 | } |
116 | if ( !mTrailingWS ) { |
117 | if ( isWS( prevChar ) || |
118 | ( prevChar == '\r' && isWS( prevPrevChar ) ) ) { |
119 | mTrailingWS = true; |
120 | } |
121 | } |
122 | currentLineLength = 0; |
123 | break; |
124 | case 'F': // check for lines starting with From_ if not found already: |
125 | if ( !mLeadingFrom ) { |
126 | if ( prevChar == '\n' && end - it >= 5 && |
127 | !qstrncmp( "From " , it, 5 ) ) { |
128 | mLeadingFrom = true; |
129 | } |
130 | } |
131 | ++mPrintable; |
132 | break; |
133 | default: |
134 | { |
135 | uchar c = *it; |
136 | if ( c == '\t' || ( c >= ' ' && c <= '~' ) ) { |
137 | ++mPrintable; |
138 | } else if ( c == 127 || c < ' ' ) { |
139 | ++mCTL; |
140 | } else { |
141 | ++mEightBit; |
142 | } |
143 | } |
144 | } |
145 | prevPrevChar = prevChar; |
146 | prevChar = *it; |
147 | } |
148 | |
149 | // consider the length of the last line |
150 | if ( currentLineLength >= mLineMax ) { |
151 | mLineMax = currentLineLength; |
152 | } |
153 | if ( currentLineLength <= mLineMin ) { |
154 | mLineMin = currentLineLength; |
155 | } |
156 | |
157 | // check whether the last character is tab or space |
158 | if ( isWS( prevChar ) ) { |
159 | mTrailingWS = true; |
160 | } |
161 | |
162 | mTotal = len; |
163 | } |
164 | |
165 | bool CharFreq::isEightBitData() const |
166 | { |
167 | return type() == EightBitData; |
168 | } |
169 | |
170 | bool CharFreq::isEightBitText() const |
171 | { |
172 | return type() == EightBitText; |
173 | } |
174 | |
175 | bool CharFreq::isSevenBitData() const |
176 | { |
177 | return type() == SevenBitData; |
178 | } |
179 | |
180 | bool CharFreq::isSevenBitText() const |
181 | { |
182 | return type() == SevenBitText; |
183 | } |
184 | |
185 | bool CharFreq::hasTrailingWhitespace() const |
186 | { |
187 | return mTrailingWS; |
188 | } |
189 | |
190 | bool CharFreq::hasLeadingFrom() const |
191 | { |
192 | return mLeadingFrom; |
193 | } |
194 | |
195 | CharFreq::Type CharFreq::type() const |
196 | { |
197 | #if 0 |
198 | qDebug( "Total: %d; NUL: %d; CTL: %d;\n" |
199 | "CR: %d; LF: %d; CRLF: %d;\n" |
200 | "lineMin: %d; lineMax: %d;\n" |
201 | "printable: %d; eightBit: %d;\n" |
202 | "trailing whitespace: %s;\n" |
203 | "leading 'From ': %s;\n" , |
204 | total, NUL, CTL, CR, LF, CRLF, lineMin, lineMax, |
205 | printable, eightBit, |
206 | mTrailingWS ? "yes" : "no" , mLeadingFrom ? "yes" : "no" ); |
207 | #endif |
208 | if ( mNUL ) { // must be binary |
209 | return Binary; |
210 | } |
211 | |
212 | // doesn't contain NUL's: |
213 | if ( mEightBit ) { |
214 | if ( mLineMax > 988 ) { |
215 | return EightBitData; // not allowed in 8bit |
216 | } |
217 | if ( ( mLF != mCRLF && mCRLF > 0 ) || mCR != mCRLF || controlCodesRatio() > 0.2 ) { |
218 | return EightBitData; |
219 | } |
220 | return EightBitText; |
221 | } |
222 | |
223 | // doesn't contain NUL's, nor 8bit chars: |
224 | if ( mLineMax > 988 ) { |
225 | return SevenBitData; |
226 | } |
227 | if ( ( mLF != mCRLF && mCRLF > 0 ) || mCR != mCRLF || controlCodesRatio() > 0.2 ) { |
228 | return SevenBitData; |
229 | } |
230 | |
231 | // no NUL, no 8bit chars, no excessive CTLs and no lines > 998 chars: |
232 | return SevenBitText; |
233 | } |
234 | |
235 | float CharFreq::printableRatio() const |
236 | { |
237 | if ( mTotal ) { |
238 | return float( mPrintable ) / float( mTotal ); |
239 | } else { |
240 | return 0; |
241 | } |
242 | } |
243 | |
244 | float CharFreq::controlCodesRatio() const |
245 | { |
246 | if ( mTotal ) { |
247 | return float( mCTL ) / float( mTotal ); |
248 | } else { |
249 | return 0; |
250 | } |
251 | } |
252 | |
253 | |