Bug #662 » exiv2-exifcomment-unicode.patch
src/convert.cpp (working copy) | ||
---|---|---|
1325 | 1325 |
std::string outstr; |
1326 | 1326 |
EXV_ICONV_CONST char *inptr = const_cast<char *>(str.c_str()); |
1327 | 1327 |
size_t inbytesleft = str.length(); |
1328 | ||
1328 |
int outbytesProduced = 0; |
|
1329 | 1329 |
while (inbytesleft) { |
1330 | 1330 |
char outbuf[100]; |
1331 | 1331 |
char *outptr = outbuf; |
... | ... | |
1335 | 1335 |
&inbytesleft, |
1336 | 1336 |
&outptr, |
1337 | 1337 |
&outbytesleft); |
1338 |
outbytesProduced += sizeof(outbuf) - 1 - outbytesleft; |
|
1338 | 1339 |
if (rc == size_t(-1) && errno != E2BIG) { |
1339 | 1340 |
#ifndef SUPPRESS_WARNINGS |
1340 | 1341 |
std::cerr << "Warning: iconv: " |
... | ... | |
1345 | 1346 |
break; |
1346 | 1347 |
} |
1347 | 1348 |
*outptr = '\0'; |
1348 |
outstr.append(outbuf);
|
|
1349 |
outstr.append(std::string(outbuf, outbytesProduced));
|
|
1349 | 1350 |
} |
1350 | 1351 |
if (cd != (iconv_t)(-1)) { |
1351 | 1352 |
iconv_close(cd); |
src/exiv2.cpp (working copy) | ||
---|---|---|
41 | 41 |
#include "exiv2.hpp" |
42 | 42 |
#include "actions.hpp" |
43 | 43 |
#include "utils.hpp" |
44 |
#include "convert.hpp" |
|
44 | 45 |
#include "i18n.h" // NLS support. |
45 | 46 |
#include "xmp.hpp" |
46 | 47 | |
... | ... | |
117 | 118 |
*/ |
118 | 119 |
bool parseLine(ModifyCmd& modifyCmd, |
119 | 120 |
const std::string& line, int num); |
120 | ||
121 |
|
|
122 |
/*! |
|
123 |
@brief Parses a string containing backslash-escapes |
|
124 |
@param input Input string, assumed to be UTF-8 |
|
125 |
*/ |
|
126 |
std::string parseEscapes(const std::string& input); |
|
121 | 127 |
} |
122 | 128 | |
123 | 129 |
// ***************************************************************************** |
... | ... | |
1076 | 1082 |
} |
1077 | 1083 |
} |
1078 | 1084 | |
1079 |
value = line.substr(valStart, valEnd+1-valStart);
|
|
1085 |
value = parseEscapes(line.substr(valStart, valEnd+1-valStart));
|
|
1080 | 1086 |
std::string::size_type last = value.length()-1; |
1081 | 1087 |
if ( (value[0] == '"' && value[last] == '"') |
1082 | 1088 |
|| (value[0] == '\'' && value[last] == '\'')) { |
... | ... | |
1108 | 1114 |
return cmdIdAndString[i].cmdId_; |
1109 | 1115 |
} |
1110 | 1116 | |
1117 |
/*!
  @brief Expands backslash-escapes in a string.

  Recognised escapes are a doubled backslash (one backslash), backslash-n
  (newline), backslash-t (tab) and backslash-u followed by exactly four
  hexadecimal digits (a UTF-16 code unit in the Basic Multilingual Plane,
  appended to the result as UTF-8). Anything else - an unknown escape
  letter, a backslash-u escape with missing or non-hex digits, a
  backslash-u escape naming a lone surrogate (D800-DFFF), or a backslash at
  the very end of the input - is copied to the result unchanged.

  @param input Input string, assumed to be UTF-8
  @return The input with all recognised escapes expanded
 */
std::string parseEscapes(const std::string& input)
{
    const std::string::size_type len = input.length();
    std::string result;
    result.reserve(len);

    for (std::string::size_type i = 0; i < len; ++i) {
        const char ch = input[i];

        // Ordinary characters, and a backslash with nothing after it,
        // are copied verbatim.
        if (ch != '\\' || i + 1 >= len) {
            result.push_back(ch);
            continue;
        }

        const char esc = input[++i];
        switch (esc) {
        // Escaping of backslash
        case '\\':
            result.push_back('\\');
            break;

        // Escaping of newline
        case 'n':
            result.push_back('\n');
            break;

        // Escaping of tab
        case 't':
            result.push_back('\t');
            break;

        // Escaping of unicode
        case 'u': {
            // i indexes the 'u'; four more characters are required.
            // len > i always holds here, so the subtraction cannot wrap.
            bool valid = (len - i > 4);
            unsigned int code = 0;
            for (std::string::size_type j = 1; valid && j <= 4; ++j) {
                const char digit = input[i + j];
                code <<= 4;
                if (digit >= '0' && digit <= '9') {
                    code |= static_cast<unsigned int>(digit - '0');
                } else if (digit >= 'a' && digit <= 'f') {
                    code |= static_cast<unsigned int>(digit - 'a' + 10);
                } else if (digit >= 'A' && digit <= 'F') {
                    code |= static_cast<unsigned int>(digit - 'A' + 10);
                } else {
                    valid = false;
                }
            }
            // A lone surrogate is not a character and has no UTF-8 form.
            if (valid && code >= 0xD800 && code <= 0xDFFF) {
                valid = false;
            }
            if (!valid) {
                // Keep the escape as literal text. Only the backslash and
                // the 'u' are consumed; the following characters are
                // processed normally by the next loop iterations.
                result.push_back('\\');
                result.push_back(esc);
                break;
            }
            i += 4;

            // Encode the code point (at most 0xFFFF) as UTF-8.
            if (code < 0x80) {
                result.push_back(static_cast<char>(code));
            } else if (code < 0x800) {
                result.push_back(static_cast<char>(0xC0 | (code >> 6)));
                result.push_back(static_cast<char>(0x80 | (code & 0x3F)));
            } else {
                result.push_back(static_cast<char>(0xE0 | (code >> 12)));
                result.push_back(static_cast<char>(0x80 | ((code >> 6) & 0x3F)));
                result.push_back(static_cast<char>(0x80 | (code & 0x3F)));
            }
            break;
        }

        // Unknown escape: keep it as written
        default:
            result.push_back('\\');
            result.push_back(esc);
        }
    }

    return result;
}
|
1194 |
|
|
1111 | 1195 |
} |
1196 |
|
src/value.cpp (working copy) | ||
---|---|---|
34 | 34 |
// included header files |
35 | 35 |
#include "value.hpp" |
36 | 36 |
#include "types.hpp" |
37 |
#include "convert.hpp" |
|
37 | 38 |
#include "error.hpp" |
38 | 39 | |
39 | 40 |
// + standard includes |
... | ... | |
457 | 458 |
int CommentValue::read(const std::string& comment) |
458 | 459 |
{ |
459 | 460 |
std::string c = comment; |
460 |
CharsetId charsetId = undefined;
|
|
461 |
charsetId_ = undefined;
|
|
461 | 462 |
if (comment.length() > 8 && comment.substr(0, 8) == "charset=") { |
462 | 463 |
std::string::size_type pos = comment.find_first_of(' '); |
463 | 464 |
std::string name = comment.substr(8, pos-8); |
464 | 465 |
// Strip quotes (so you can also specify the charset without quotes) |
465 | 466 |
if (name[0] == '"') name = name.substr(1); |
466 | 467 |
if (name[name.length()-1] == '"') name = name.substr(0, name.length()-1); |
467 |
charsetId = CharsetInfo::charsetIdByName(name); |
|
468 |
if (charsetId == invalidCharsetId) { |
|
468 |
charsetId_ = CharsetInfo::charsetIdByName(name);
|
|
469 |
if (charsetId_ == invalidCharsetId) {
|
|
469 | 470 |
#ifndef SUPPRESS_WARNINGS |
470 | 471 |
std::cerr << "Warning: " << Error(28, name) << "\n"; |
471 | 472 |
#endif |
... | ... | |
474 | 475 |
c.clear(); |
475 | 476 |
if (pos != std::string::npos) c = comment.substr(pos+1); |
476 | 477 |
} |
477 |
const std::string code(CharsetInfo::code(charsetId), 8); |
|
478 |
return StringValueBase::read(code + c); |
|
478 |
value_ = c; |
|
479 |
|
|
480 |
return 0; |
|
479 | 481 |
} |
482 |
|
|
483 |
int CommentValue::read(const byte* buf, long len, ByteOrder byteOrder) |
|
484 |
{ |
|
485 |
if (buf) { |
|
486 |
std::string rawValue = std::string(reinterpret_cast<const char*>(buf), len); |
|
487 |
if (rawValue.length() < 8) { |
|
488 |
return 0; |
|
489 |
} |
|
490 |
charsetId_ = CharsetInfo::charsetIdByCode(rawValue.substr(0, 8)); |
|
491 |
value_ = std::string(rawValue.substr(8)); |
|
492 |
switch (charsetId_) { |
|
493 |
case unicode: |
|
494 |
if (byteOrder == littleEndian) { |
|
495 |
Exiv2::convertStringCharset(value_, "UCS-2LE", "UTF-8"); |
|
496 |
} else { |
|
497 |
Exiv2::convertStringCharset(value_, "UCS-2BE", "UTF-8"); |
|
498 |
} |
|
499 |
break; |
|
500 |
|
|
501 |
case ascii: |
|
502 |
break; |
|
503 |
|
|
504 |
case jis: |
|
505 |
// The Exif 2.2 specification mentions JIS X 208-1990 as the |
|
506 |
// encoding for "jis". The problem is that JIS X 208-1990 isn't |
|
507 |
// a character encoding - that is, it doesn't specify how to |
|
508 |
// encode a character set into bytes. Candidates (iconv names) are: |
|
509 |
// EUC-JP, SHIFT_JIS, CP932, ISO-2022-JP, ISO-2022-JP-2 and ISO-2022-JP-1. |
|
510 |
// Pending a definitive resolution to this, we'll just leave any JIS |
|
511 |
// comment as we found it. |
|
512 |
break; |
|
513 |
|
|
514 |
default: |
|
515 |
break; |
|
516 |
} |
|
517 |
} |
|
518 |
return 0; |
|
519 |
} |
|
480 | 520 | |
481 | 521 |
std::ostream& CommentValue::write(std::ostream& os) const |
482 | 522 |
{ |
483 |
CharsetId charsetId = this->charsetId(); |
|
484 |
if (charsetId != undefined) { |
|
485 |
os << "charset=\"" << CharsetInfo::name(charsetId) << "\" "; |
|
523 |
if (charsetId_ != undefined) { |
|
524 |
os << "charset=\"" << CharsetInfo::name(charsetId_) << "\" "; |
|
486 | 525 |
} |
487 | 526 |
return os << comment(); |
488 | 527 |
} |
528 |
|
|
529 |
// Writes the encoded comment (as produced by encode() for the given byte
// order) into buf and returns the number of bytes written.
// The caller must supply a buffer large enough for the encoded data.
long CommentValue::copy(byte* buf, ByteOrder byteOrder) const
{
    const std::string bytes = encode(byteOrder);
    const std::string::size_type numBytes = bytes.length();
    memcpy(buf, bytes.c_str(), numBytes);
    return numBytes;
}
|
535 |
|
|
536 |
long CommentValue::count() const |
|
537 |
{ |
|
538 |
return encode(littleEndian).length(); |
|
539 |
} |
|
540 |
|
|
541 |
long CommentValue::size() const |
|
542 |
{ |
|
543 |
return encode(littleEndian).length(); |
|
544 |
} |
|
545 |
|
|
546 |
// Builds the Exif representation of the comment: the 8-byte charset code
// followed by the comment text. For the unicode charset the text is passed
// through convertStringCharset from "UTF-8" to the UCS-2 variant matching
// byteOrder; for every other charset it is appended as stored.
std::string CommentValue::encode(ByteOrder byteOrder) const
{
    std::string encoded(CharsetInfo::code(charsetId()), 8);

    const bool isUnicode = (charsetId() == unicode);
    std::string text = comment();
    if (isUnicode) {
        // The result of the conversion call is not checked.
        const char* to = (byteOrder == littleEndian) ? "UCS-2LE" : "UCS-2BE";
        Exiv2::convertStringCharset(text, "UTF-8", to);
    }

    encoded.append(text);
    return encoded;
}
|
489 | 570 | |
490 | 571 |
std::string CommentValue::comment() const |
491 | 572 |
{ |
492 |
if (value_.length() >= 8) return value_.substr(8); |
|
493 |
return ""; |
|
573 |
return value_; |
|
494 | 574 |
} |
495 | 575 | |
496 | 576 |
CommentValue::CharsetId CommentValue::charsetId() const |
497 | 577 |
{ |
498 |
CharsetId charsetId = undefined; |
|
499 |
if (value_.length() >= 8) { |
|
500 |
const std::string code = value_.substr(0, 8); |
|
501 |
charsetId = CharsetInfo::charsetIdByCode(code); |
|
502 |
} |
|
503 |
return charsetId; |
|
578 |
return charsetId_; |
|
504 | 579 |
} |
505 | 580 | |
506 | 581 |
CommentValue* CommentValue::clone_() const |
src/value.hpp (working copy) | ||
---|---|---|
581 | 581 |
*/ |
582 | 582 |
int read(const std::string& comment); |
583 | 583 |
//@} |
584 |
|
|
585 |
/*! |
|
586 |
@brief Read the value from a byte buffer |
|
587 |
*/ |
|
588 |
int read(const byte* buf, long len, ByteOrder /*byteOrder*/); |
|
584 | 589 | |
585 | 590 |
//! @name Accessors |
586 | 591 |
//@{ |
... | ... | |
590 | 595 |
read(const std::string& comment). |
591 | 596 |
*/ |
592 | 597 |
std::ostream& write(std::ostream& os) const; |
598 |
|
|
599 |
long copy(byte* buf, ByteOrder byteOrder) const; |
|
600 |
long count() const; |
|
601 |
long size() const; |
|
602 |
|
|
593 | 603 |
//! Return the comment (without a charset="..." prefix) |
594 | 604 |
std::string comment() const; |
595 | 605 |
//! Return the charset id of the comment |
... | ... | |
597 | 607 |
//@} |
598 | 608 | |
599 | 609 |
private: |
610 |
//! The character set of the comment string |
|
611 |
CharsetId charsetId_; |
|
612 |
|
|
613 |
//! Encodes this value as an EXIF-comment |
|
614 |
std::string encode(ByteOrder byteOrder) const; |
|
615 |
|
|
600 | 616 |
//! Internal virtual copy constructor. |
601 | 617 |
EXV_DLLLOCAL virtual CommentValue* clone_() const; |
602 | 618 |
test/exifcomment-encoding-test.sh (revision 0) | ||
---|---|---|
1 |
#! /bin/sh |
|
2 |
# Test driver for exiv2 Exif.Photo.UserComment character encoding tests |
|
3 |
scriptdir=`dirname $0` |
|
4 |
cd $scriptdir |
|
5 |
exiv2="../src/exiv2" |
|
6 | ||
7 |
# Function takes two parameters |
|
8 |
# |
|
9 |
# 1. An exiv2 comment spec |
|
10 |
# 2. The expected exiv2 hex dump of the UserComment value |
|
11 |
# |
|
12 |
# Sets Exif.Photo.UserComment on a fresh copy of the test image to the
# comment spec in $1, dumps the tag as hex and compares the dump with the
# expected string in $2. Prints both strings and returns 1 on a mismatch.
#
# Declared with the POSIX "name() { ... }" form: the script runs under
# /bin/sh, where the "function" keyword is not guaranteed to exist.
writeComment() {
    cp ./data/exiv2-bug662.jpg ./tmp/exiv2-bug662.jpg
    $exiv2 mo "-Mset Exif.Photo.UserComment $1" ./tmp/exiv2-bug662.jpg
    res=`$exiv2 pr -PEnh ./tmp/exiv2-bug662.jpg | grep --after-context=200 UserComment`

    # remove newlines and the tag name
    # ($res is deliberately unquoted so that word splitting joins the lines)
    res=`echo $res | colrm 1 12 | sed -e 's/"//g'`
    if [ "$res" != "$2" ] ; then
        echo "Expected:"
        echo "$2"
        echo "Got:"
        echo "$res"
        return 1
    fi
}
|
27 | ||
28 |
writeComment "charset=Ascii An ascii comment" "0000 41 53 43 49 49 00 00 00 41 6e 20 61 73 63 69 69 ASCII...An ascii 0010 20 63 6f 6d 6d 65 6e 74 comment" |
|
29 |
writeComment "charset=Ascii A\\nnewline" "0000 41 53 43 49 49 00 00 00 41 0a 6e 65 77 6c 69 6e ASCII...A.newlin 0010 65 e" |
|
30 |
writeComment "charset=Unicode A Unicode comment" "0000 55 4e 49 43 4f 44 45 00 41 00 20 00 55 00 6e 00 UNICODE.A. .U.n. 0010 69 00 63 00 6f 00 64 00 65 00 20 00 63 00 6f 00 i.c.o.d.e. .c.o. 0020 6d 00 6d 00 65 00 6e 00 74 00 m.m.e.n.t." |
|
31 |
writeComment "charset=Unicode \\u01c4" "0000 55 4e 49 43 4f 44 45 00 c4 01 UNICODE..." |
|
32 |
writeComment "charset=Unicode A\\u01c4C" "0000 55 4e 49 43 4f 44 45 00 41 00 c4 01 43 00 UNICODE.A...C." |
|
33 |
writeComment "charset=Unicode With\\nNewline" "0000 55 4e 49 43 4f 44 45 00 57 00 69 00 74 00 68 00 UNICODE.W.i.t.h. 0010 0a 00 4e 00 65 00 77 00 6c 00 69 00 6e 00 65 00 ..N.e.w.l.i.n.e." |
|
34 |
writeComment "charset=Unicode With\\tTab" "0000 55 4e 49 43 4f 44 45 00 57 00 69 00 74 00 68 00 UNICODE.W.i.t.h. 0010 09 00 54 00 61 00 62 00 ..T.a.b." |
|
35 | ||
36 |
# Test invalid escape sequences |
|
37 |
writeComment "charset=Unicode \\ugggg" "0000 55 4e 49 43 4f 44 45 00 5c 00 75 00 67 00 67 00 UNICODE.\.u.g.g. 0010 67 00 67 00 g.g." |
test/Makefile (working copy) | ||
---|---|---|
61 | 61 |
bugfixes-test.sh \ |
62 | 62 |
exifdata-test.sh \ |
63 | 63 |
exiv2-test.sh \ |
64 |
exifcomment-encoding-test.sh \ |
|
64 | 65 |
imagetest.sh \ |
65 | 66 |
iotest.sh \ |
66 | 67 |
iptctest.sh \ |