1/* -*- c++ -*-
2 kmime_charfreq.h
3
4 KMime, the KDE Internet mail/usenet news message library.
5 Copyright (c) 2001-2002 Marc Mutz <mutz@kde.org>
6
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Library General Public
9 License as published by the Free Software Foundation; either
10 version 2 of the License, or (at your option) any later version.
11
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
16
17 You should have received a copy of the GNU Library General Public License
18 along with this library; see the file COPYING.LIB. If not, write to
19 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 Boston, MA 02110-1301, USA.
21*/
22/**
23 @file
24 This file is part of the API for handling @ref MIME data and
25 defines the CharFreq class.
26
27 @brief
28 Defines the CharFreq class.
29
30 @authors Marc Mutz \<mutz@kde.org\>
31
32 @glossary @anchor Eight-Bit @anchor eight-bit @b 8-bit:
33 Data that contains bytes with at least one value greater than 127, or at
34 least one NUL byte.
35
36 @glossary @anchor Eight-Bit-Binary @anchor eight-bit-binary @b 8-bit-binary:
 Eight-bit data that contains a high percentage of non-ASCII values,
38 or lines longer than 998 characters, or stray CRs, or NULs.
39
40 @glossary @anchor Eight-Bit-Text @anchor eight-bit-text @b 8-bit-text:
 Eight-bit data that contains a high percentage of ASCII values,
42 no lines longer than 998 characters, no NULs, and either only LFs or
43 only CRLFs.
44
 @glossary @anchor Seven-Bit @anchor seven-bit @b 7-bit:
46 Data that contains bytes with all values less than 128, and no NULs.
47
48 @glossary @anchor Seven-Bit-Binary @anchor seven-bit-binary @b 7-bit-binary:
 Seven-bit data that contains a high percentage of non-ASCII values,
50 or lines longer than 998 characters, or stray CRs.
51
52 @glossary @anchor Seven-Bit-Text @anchor seven-bit-text @b 7-bit-text:
 Seven-bit data that contains a high percentage of ASCII values,
54 no lines longer than 998 characters, and either only LFs, or only CRLFs.
55*/
56
57#ifndef __KMIME_CHARFREQ_H__
58#define __KMIME_CHARFREQ_H__
59
60#include <QtCore/QByteArray>
61#include "kmime_export.h"
62#undef None
63
64namespace KMime {
65
66/**
67 @brief
68 A class for performing basic data typing using frequency count heuristics.
69
70 This class performs character frequency counts on the provided data which
71 are used in heuristics to determine a basic data type. The data types are:
72
73 - @ref Eight-Bit-Binary
74 - @ref Eight-Bit-Text
75 - @ref Seven-Bit-Binary
76 - @ref Seven-Bit-Text
77*/
78class KMIME_EXPORT CharFreq
79{
80 public:
81 /**
82 Constructs a Character Frequency instance for a buffer @p buf of
83 QByteArray data.
84
85 @param buf is a QByteArray containing the data.
86 */
87 explicit CharFreq( const QByteArray &buf );
88
89 /**
90 Constructs a Character Frequency instance for a buffer @p buf of
91 chars of length @p len.
92
93 @param buf is a pointer to a character string containing the data.
94 @param len is the length of @p buf, in characters.
95 */
96 CharFreq( const char *buf, size_t len );
97
98 /**
99 The different types of data.
100 */
101 enum Type {
102 None = 0, /**< Unknown */
103 EightBitData, /**< 8bit binary */
104 Binary = EightBitData, /**< 8bit binary */
105 SevenBitData, /**< 7bit binary */
106 EightBitText, /**< 8bit text */
107 SevenBitText /**< 7bit text */
108 };
109
110 /**
111 Returns the data #Type as derived from the class heuristics.
112 */
113 Type type() const;
114
115 /**
116 Returns true if the data #Type is EightBitData; false otherwise.
117 */
118 bool isEightBitData() const;
119
120 /**
121 Returns true if the data #Type is EightBitText; false otherwise.
122 */
123 bool isEightBitText() const;
124
125 /**
126 Returns true if the data #Type is SevenBitData; false otherwise.
127 */
128 bool isSevenBitData() const;
129
130 /**
131 Returns true if the data #Type is SevenBitText; false otherwise.
132 */
133 bool isSevenBitText() const;
134
135 /**
136 Returns true if the data contains trailing whitespace. i.e.,
137 if any line ends with space (' ') or tab ('\\t').
138 */
139 bool hasTrailingWhitespace() const;
140
141 /**
142 Returns true if the data contains a line that starts with "From ".
143 */
144 bool hasLeadingFrom() const;
145
146 /**
147 Returns the percentage of printable characters in the data.
148 The result is undefined if the number of data characters is zero.
149 */
150 float printableRatio() const;
151
152 /**
153 Returns the percentage of control code characters (CTLs) in the data.
154 The result is undefined if the number of data characters is zero.
155 */
156 float controlCodesRatio() const;
157
158 private:
159 //@cond PRIVATE
160 uint mNUL; // count of NUL chars
161 uint mCTL; // count of CTLs (incl. DEL, excl. CR, LF, HT)
162 uint mCR; // count of CR chars
163 uint mLF; // count of LF chars
164 uint mCRLF; // count of LFs, preceded by CRs
165 uint mPrintable; // count of printable US-ASCII chars (SPC..~)
166 uint mEightBit; // count of other latin1 chars (those with 8th bit set)
167 uint mTotal; // count of all chars
168 uint mLineMin; // minimum line length
169 uint mLineMax; // maximum line length
170 bool mTrailingWS; // does the buffer contain trailing whitespace?
171 bool mLeadingFrom; // does the buffer contain lines starting with "From "?
172 //@endcond
173
174 /**
175 Performs the character frequency counts on the data.
176
177 @param buf is a pointer to a character string containing the data.
178 @param len is the length of @p buf, in characters.
179 */
180 void count( const char *buf, size_t len );
181};
182
183} // namespace KMime
184
185#endif /* __KMIME_CHARFREQ_H__ */
186