1/*
2 kmime_charfreq.cpp
3
4 KMime, the KDE Internet mail/usenet news message library.
5 Copyright (c) 2001-2002 Marc Mutz <mutz@kde.org>
6
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Library General Public
9 License as published by the Free Software Foundation; either
10 version 2 of the License, or (at your option) any later version.
11
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
16
17 You should have received a copy of the GNU Library General Public License
18 along with this library; see the file COPYING.LIB. If not, write to
19 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 Boston, MA 02110-1301, USA.
21*/
22
23/**
24 @file
25 This file is part of the API for handling MIME data and
26 defines the CharFreq class.
27
28 @brief
29 Defines the CharFreq class.
30
31 @authors Marc Mutz \<mutz@kde.org\>
32*/
33
34#include "kmime_charfreq.h"
35
36using namespace KMime;
37
38/**
39 * Private class that helps to provide binary compatibility between releases.
40 * @internal
41 */
42//@cond PRIVATE
43//class KMime::CharFreq::Private
44//{
45// public:
46//};
47//@endcond
48
49CharFreq::CharFreq( const QByteArray &buf )
50 : mNUL( 0 ),
51 mCTL( 0 ),
52 mCR( 0 ), mLF( 0 ),
53 mCRLF( 0 ),
54 mPrintable( 0 ),
55 mEightBit( 0 ),
56 mTotal( 0 ),
57 mLineMin( 0xffffffff ),
58 mLineMax( 0 ),
59 mTrailingWS( false ),
60 mLeadingFrom( false )
61{
62 if ( !buf.isEmpty() ) {
63 count( buf.data(), buf.size() );
64 }
65}
66
67CharFreq::CharFreq( const char *buf, size_t len )
68 : mNUL( 0 ),
69 mCTL( 0 ),
70 mCR( 0 ), mLF( 0 ),
71 mCRLF( 0 ),
72 mPrintable( 0 ),
73 mEightBit( 0 ),
74 mTotal( 0 ),
75 mLineMin( 0xffffffff ),
76 mLineMax( 0 ),
77 mTrailingWS( false ),
78 mLeadingFrom( false )
79{
80 if ( buf && len > 0 ) {
81 count( buf, len );
82 }
83}
84
85//@cond PRIVATE
// Returns true if ch is a horizontal tab or a space, false for anything else.
static inline bool isWS( char ch )
{
  switch ( ch ) {
  case ' ':
  case '\t':
    return true;
  default:
    return false;
  }
}
90//@endcond
91
92void CharFreq::count( const char *it, size_t len )
93{
94 const char *end = it + len;
95 uint currentLineLength = 0;
96 // initialize the prevChar with LF so that From_ detection works w/o
97 // special-casing:
98 char prevChar = '\n';
99 char prevPrevChar = 0;
100
101 for ( ; it != end ; ++it ) {
102 ++currentLineLength;
103 switch ( *it ) {
104 case '\0': ++mNUL; break;
105 case '\r': ++mCR; break;
106 case '\n': ++mLF;
107 if ( prevChar == '\r' ) {
108 --currentLineLength; ++mCRLF;
109 }
110 if ( currentLineLength >= mLineMax ) {
111 mLineMax = currentLineLength-1;
112 }
113 if ( currentLineLength <= mLineMin ) {
114 mLineMin = currentLineLength-1;
115 }
116 if ( !mTrailingWS ) {
117 if ( isWS( prevChar ) ||
118 ( prevChar == '\r' && isWS( prevPrevChar ) ) ) {
119 mTrailingWS = true;
120 }
121 }
122 currentLineLength = 0;
123 break;
124 case 'F': // check for lines starting with From_ if not found already:
125 if ( !mLeadingFrom ) {
126 if ( prevChar == '\n' && end - it >= 5 &&
127 !qstrncmp( "From ", it, 5 ) ) {
128 mLeadingFrom = true;
129 }
130 }
131 ++mPrintable;
132 break;
133 default:
134 {
135 uchar c = *it;
136 if ( c == '\t' || ( c >= ' ' && c <= '~' ) ) {
137 ++mPrintable;
138 } else if ( c == 127 || c < ' ' ) {
139 ++mCTL;
140 } else {
141 ++mEightBit;
142 }
143 }
144 }
145 prevPrevChar = prevChar;
146 prevChar = *it;
147 }
148
149 // consider the length of the last line
150 if ( currentLineLength >= mLineMax ) {
151 mLineMax = currentLineLength;
152 }
153 if ( currentLineLength <= mLineMin ) {
154 mLineMin = currentLineLength;
155 }
156
157 // check whether the last character is tab or space
158 if ( isWS( prevChar ) ) {
159 mTrailingWS = true;
160 }
161
162 mTotal = len;
163}
164
165bool CharFreq::isEightBitData() const
166{
167 return type() == EightBitData;
168}
169
170bool CharFreq::isEightBitText() const
171{
172 return type() == EightBitText;
173}
174
175bool CharFreq::isSevenBitData() const
176{
177 return type() == SevenBitData;
178}
179
180bool CharFreq::isSevenBitText() const
181{
182 return type() == SevenBitText;
183}
184
185bool CharFreq::hasTrailingWhitespace() const
186{
187 return mTrailingWS;
188}
189
190bool CharFreq::hasLeadingFrom() const
191{
192 return mLeadingFrom;
193}
194
195CharFreq::Type CharFreq::type() const
196{
197#if 0
198 qDebug( "Total: %d; NUL: %d; CTL: %d;\n"
199 "CR: %d; LF: %d; CRLF: %d;\n"
200 "lineMin: %d; lineMax: %d;\n"
201 "printable: %d; eightBit: %d;\n"
202 "trailing whitespace: %s;\n"
203 "leading 'From ': %s;\n",
204 total, NUL, CTL, CR, LF, CRLF, lineMin, lineMax,
205 printable, eightBit,
206 mTrailingWS ? "yes" : "no" , mLeadingFrom ? "yes" : "no" );
207#endif
208 if ( mNUL ) { // must be binary
209 return Binary;
210 }
211
212 // doesn't contain NUL's:
213 if ( mEightBit ) {
214 if ( mLineMax > 988 ) {
215 return EightBitData; // not allowed in 8bit
216 }
217 if ( ( mLF != mCRLF && mCRLF > 0 ) || mCR != mCRLF || controlCodesRatio() > 0.2 ) {
218 return EightBitData;
219 }
220 return EightBitText;
221 }
222
223 // doesn't contain NUL's, nor 8bit chars:
224 if ( mLineMax > 988 ) {
225 return SevenBitData;
226 }
227 if ( ( mLF != mCRLF && mCRLF > 0 ) || mCR != mCRLF || controlCodesRatio() > 0.2 ) {
228 return SevenBitData;
229 }
230
231 // no NUL, no 8bit chars, no excessive CTLs and no lines > 998 chars:
232 return SevenBitText;
233}
234
235float CharFreq::printableRatio() const
236{
237 if ( mTotal ) {
238 return float( mPrintable ) / float( mTotal );
239 } else {
240 return 0;
241 }
242}
243
244float CharFreq::controlCodesRatio() const
245{
246 if ( mTotal ) {
247 return float( mCTL ) / float( mTotal );
248 } else {
249 return 0;
250 }
251}
252
253