Project

General

Profile

Bug #662 » exiv2-exifcomment-unicode.patch

The patch. - Leo Sutic, 11 Jan 2010 15:09

View differences:

src/convert.cpp (working copy)
1325 1325
        std::string outstr;
1326 1326
        EXV_ICONV_CONST char *inptr = const_cast<char *>(str.c_str());
1327 1327
        size_t inbytesleft = str.length();
1328

  
1328
        int outbytesProduced = 0;
1329 1329
        while (inbytesleft) {
1330 1330
            char outbuf[100];
1331 1331
            char *outptr = outbuf;
......
1335 1335
                              &inbytesleft,
1336 1336
                              &outptr,
1337 1337
                              &outbytesleft);
1338
            outbytesProduced += sizeof(outbuf) - 1 - outbytesleft;
1338 1339
            if (rc == size_t(-1) && errno != E2BIG) {
1339 1340
#ifndef SUPPRESS_WARNINGS
1340 1341
                std::cerr << "Warning: iconv: "
......
1345 1346
                break;
1346 1347
            }
1347 1348
            *outptr = '\0';
1348
            outstr.append(outbuf);
1349
            outstr.append(std::string(outbuf, outbytesProduced));
1349 1350
        }
1350 1351
        if (cd != (iconv_t)(-1)) {
1351 1352
            iconv_close(cd);
src/exiv2.cpp (working copy)
41 41
#include "exiv2.hpp"
42 42
#include "actions.hpp"
43 43
#include "utils.hpp"
44
#include "convert.hpp"
44 45
#include "i18n.h"      // NLS support.
45 46
#include "xmp.hpp"
46 47

  
......
117 118
     */
118 119
    bool parseLine(ModifyCmd& modifyCmd,
119 120
                   const std::string& line, int num);
120

  
121
    
122
    /*!
123
      @brief Parses a string containing backslash-escapes
124
      @param input Input string, assumed to be UTF-8
125
     */
126
    std::string parseEscapes(const std::string& input);
121 127
}
122 128

  
123 129
// *****************************************************************************
......
1076 1082
                }
1077 1083
            }
1078 1084

  
1079
            value = line.substr(valStart, valEnd+1-valStart);
1085
            value = parseEscapes(line.substr(valStart, valEnd+1-valStart));
1080 1086
            std::string::size_type last = value.length()-1;
1081 1087
            if (   (value[0] == '"' && value[last] == '"')
1082 1088
                || (value[0] == '\'' && value[last] == '\'')) {
......
1108 1114
        return cmdIdAndString[i].cmdId_;
1109 1115
    }
1110 1116

  
1117
    std::string parseEscapes(const std::string& input) 
1118
    {
1119
        std::string result = "";
1120
        for (unsigned int i = 0; i < input.length(); ++i) {
1121
            char ch = input[i];
1122
            if (ch == '\\') {
1123
                int escapeStart = i;
1124
                if (input.length() - 1 > i) {
1125
                    ++i;
1126
                    ch = input[i];
1127
                    switch (ch) {
1128
                        // Escaping of backslash
1129
                        case '\\':
1130
                        result.push_back('\\');
1131
                        break;
1132
                        
1133
                        // Escaping of newline
1134
                        case 'n':
1135
                        result.push_back('\n');
1136
                        break;
1137
                        
1138
                        // Escaping of tab
1139
                        case 't':
1140
                        result.push_back('\t');
1141
                        break;
1142
                        
1143
                        // Escaping of unicode
1144
                        case 'u':
1145
                        if (input.length() - 4 > i) {
1146
                            int acc = 0;
1147
                            for (int j = 0; j < 4; ++j) {
1148
                                ++i;
1149
                                acc <<= 4;
1150
                                if (input[i] >= '0' && input[i] <= '9') {
1151
                                    acc |= input[i] - '0';
1152
                                } else if (input[i] >= 'a' && input[i] <= 'f') {
1153
                                    acc |= input[i] - 'a' + 10;
1154
                                } else if (input[i] >= 'A' && input[i] <= 'F') {
1155
                                    acc |= input[i] - 'A' + 10;
1156
                                } else {
1157
                                    acc = -1;
1158
                                    break;
1159
                                }
1160
                            }
1161
                            if (acc == -1) {
1162
                                result.push_back('\\');
1163
                                i = escapeStart;
1164
                                break;
1165
                            }
1166
                            
1167
                            std::string ucs2toUtf8 = "";
1168
                            ucs2toUtf8.push_back((char) ((acc & 0xff00) >> 8));
1169
                            ucs2toUtf8.push_back((char) (acc & 0x00ff));
1170
                            
1171
                            if (Exiv2::convertStringCharset (ucs2toUtf8, "UCS-2BE", "UTF-8")) {
1172
                                result.append (ucs2toUtf8);
1173
                            }
1174
                        } else {
1175
                            result.push_back('\\');
1176
                            result.push_back(ch);
1177
                        }
1178
                        break;
1179
                        
1180
                        default:
1181
                        result.push_back('\\');
1182
                        result.push_back(ch);
1183
                    }
1184
                } else {
1185
                    result.push_back(ch);
1186
                }
1187
            } else {
1188
                result.push_back(ch);
1189
            }
1190
        }
1191
        
1192
        return result;
1193
    }
1194
    
1111 1195
}
1196
        
src/value.cpp (working copy)
34 34
// included header files
35 35
#include "value.hpp"
36 36
#include "types.hpp"
37
#include "convert.hpp"
37 38
#include "error.hpp"
38 39

  
39 40
// + standard includes
......
457 458
    int CommentValue::read(const std::string& comment)
458 459
    {
459 460
        std::string c = comment;
460
        CharsetId charsetId = undefined;
461
        charsetId_ = undefined;
461 462
        if (comment.length() > 8 && comment.substr(0, 8) == "charset=") {
462 463
            std::string::size_type pos = comment.find_first_of(' ');
463 464
            std::string name = comment.substr(8, pos-8);
464 465
            // Strip quotes (so you can also specify the charset without quotes)
465 466
            if (name[0] == '"') name = name.substr(1);
466 467
            if (name[name.length()-1] == '"') name = name.substr(0, name.length()-1);
467
            charsetId = CharsetInfo::charsetIdByName(name);
468
            if (charsetId == invalidCharsetId) {
468
            charsetId_ = CharsetInfo::charsetIdByName(name);
469
            if (charsetId_ == invalidCharsetId) {
469 470
#ifndef SUPPRESS_WARNINGS
470 471
                std::cerr << "Warning: " << Error(28, name) << "\n";
471 472
#endif
......
474 475
            c.clear();
475 476
            if (pos != std::string::npos) c = comment.substr(pos+1);
476 477
        }
477
        const std::string code(CharsetInfo::code(charsetId), 8);
478
        return StringValueBase::read(code + c);
478
        value_ = c;
479
        
480
        return 0;
479 481
    }
482
    
483
    int CommentValue::read(const byte* buf, long len, ByteOrder byteOrder)
484
    {
485
        if (buf) {
486
            std::string rawValue = std::string(reinterpret_cast<const char*>(buf), len);
487
            if (rawValue.length() < 8) {
488
                return 0;
489
            }
490
            charsetId_ = CharsetInfo::charsetIdByCode(rawValue.substr(0, 8));
491
            value_ = std::string(rawValue.substr(8));
492
            switch (charsetId_) {
493
                case unicode:
494
                if (byteOrder == littleEndian) {
495
                    Exiv2::convertStringCharset(value_, "UCS-2LE", "UTF-8");
496
                } else {
497
                    Exiv2::convertStringCharset(value_, "UCS-2BE", "UTF-8");
498
                }
499
                break;
500
                
501
                case ascii:
502
                break;
503
                
504
                case jis:
505
                // The Exif 2.2 specification mentions JIS X 208-1990 as the
506
                // encoding for "jis". The problem is that JIS X 208-1990 isn't
507
                // a character encoding - that is, it doesn't specify how to
508
                // encode a character set into bytes. Candidates (iconv names) are: 
509
                // EUC-JP, SHIFT_JIS, CP932, ISO-2022-JP, ISO-2022-JP-2 and ISO-2022-JP-1. 
510
                // Pending a definitive resolution to this, we'll just leave any JIS 
511
                // comment as we found it.
512
                break;
513
                
514
                default:
515
                break;
516
            }
517
        }
518
        return 0;
519
    }
480 520

  
481 521
    std::ostream& CommentValue::write(std::ostream& os) const
482 522
    {
483
        CharsetId charsetId = this->charsetId();
484
        if (charsetId != undefined) {
485
            os << "charset=\"" << CharsetInfo::name(charsetId) << "\" ";
523
        if (charsetId_ != undefined) {
524
            os << "charset=\"" << CharsetInfo::name(charsetId_) << "\" ";
486 525
        }
487 526
        return os << comment();
488 527
    }
528
    
529
    /*!
      @brief Write the encoded comment (charset code followed by the
             comment text) to a buffer.
      @param buf       Destination buffer; the caller must provide room for
                       the whole encoded value
      @param byteOrder Byte order passed on to encode()
      @return Number of bytes written
     */
    long CommentValue::copy(byte* buf, ByteOrder byteOrder) const
    {
        const std::string encoded = encode(byteOrder);
        const std::string::size_type numBytes = encoded.length();
        memcpy(buf, encoded.c_str(), numBytes);
        return static_cast<long>(numBytes);
    }
535
    
536
    long CommentValue::count() const
537
    {
538
        return encode(littleEndian).length();
539
    }
540
    
541
    long CommentValue::size() const 
542
    {
543
        return encode(littleEndian).length();
544
    }
545
    
546
    /*!
      @brief Build the byte representation of the comment: the 8-byte
             charset code followed by the comment text.

      For the unicode charset the text is converted from UTF-8 to UCS-2 in
      the requested byte order; for every other charset the text is
      appended as it is.

      @param byteOrder Byte order used for UCS-2 output
      @return The encoded comment
     */
    std::string CommentValue::encode(ByteOrder byteOrder) const
    {
        const CharsetId id = charsetId();
        std::string encoded(CharsetInfo::code(id), 8);

        if (id != unicode) {
            encoded.append(comment());
            return encoded;
        }

        std::string text = comment();
        const char* target = (byteOrder == littleEndian) ? "UCS-2LE" : "UCS-2BE";
        Exiv2::convertStringCharset(text, "UTF-8", target);
        encoded.append(text);
        return encoded;
    }
489 570

  
490 571
    std::string CommentValue::comment() const
491 572
    {
492
        if (value_.length() >= 8) return value_.substr(8);
493
        return "";
573
        return value_;
494 574
    }
495 575

  
496 576
    CommentValue::CharsetId CommentValue::charsetId() const
497 577
    {
498
        CharsetId charsetId = undefined;
499
        if (value_.length() >= 8) {
500
            const std::string code = value_.substr(0, 8);
501
            charsetId = CharsetInfo::charsetIdByCode(code);
502
        }
503
        return charsetId;
578
        return charsetId_;
504 579
    }
505 580

  
506 581
    CommentValue* CommentValue::clone_() const
src/value.hpp (working copy)
581 581
        */
582 582
        int read(const std::string& comment);
583 583
        //@}
584
        
585
        /*!
586
          @brief Read the value from a byte buffer
587
         */
588
        int read(const byte* buf, long len, ByteOrder /*byteOrder*/);
584 589

  
585 590
        //! @name Accessors
586 591
        //@{
......
590 595
          read(const std::string& comment).
591 596
         */
592 597
        std::ostream& write(std::ostream& os) const;
598
        
599
        long copy(byte* buf, ByteOrder byteOrder) const;
600
        long count() const;
601
        long size() const;
602
        
593 603
        //! Return the comment (without a charset="..." prefix)
594 604
        std::string comment() const;
595 605
        //! Return the charset id of the comment
......
597 607
        //@}
598 608

  
599 609
    private:
610
        //! The character set of the comment string
611
        CharsetId charsetId_;
612
        
613
        //! Encodes this value as an EXIF-comment
614
        std::string encode(ByteOrder byteOrder) const;
615
        
600 616
        //! Internal virtual copy constructor.
601 617
        EXV_DLLLOCAL virtual CommentValue* clone_() const;
602 618

  
test/exifcomment-encoding-test.sh (revision 0)
1
#! /bin/sh
2
# Test driver for exiv2 Exif.Photo.UserComment character encoding tests
3
scriptdir=`dirname $0`
4
cd $scriptdir
5
exiv2="../src/exiv2"
6

  
7
# Function takes two parameters
8
#
9
# 1. An exiv2 comment spec
10
# 2. The expected exiv2 hex dump of the UserComment value
11
#
12
# Sets Exif.Photo.UserComment on a fresh copy of the test image to the
# comment spec in $1, then compares the hex dump printed by exiv2 with the
# expected dump in $2. Prints both and returns 1 if they differ.
# Defined with the POSIX "name()" form because the script runs under /bin/sh.
writeComment() {
    cp ./data/exiv2-bug662.jpg ./tmp/exiv2-bug662.jpg
    $exiv2 mo "-Mset Exif.Photo.UserComment $1" ./tmp/exiv2-bug662.jpg
    res=`$exiv2 pr -PEnh ./tmp/exiv2-bug662.jpg | grep --after-context=200 UserComment`

    # remove newlines and the tag name
    # ($res is left unquoted on purpose so the shell joins the lines)
    res=`echo $res | colrm 1 12 | sed -e 's/"//g'`
    if [ "$res" != "$2" ] ; then
        echo "Expected:"
        echo "$2"
        echo "Got:"
        echo "$res"
        return 1
    fi
}
27

  
28
writeComment "charset=Ascii An ascii comment" "0000 41 53 43 49 49 00 00 00 41 6e 20 61 73 63 69 69 ASCII...An ascii 0010 20 63 6f 6d 6d 65 6e 74 comment"
29
writeComment "charset=Ascii A\\nnewline" "0000 41 53 43 49 49 00 00 00 41 0a 6e 65 77 6c 69 6e ASCII...A.newlin 0010 65 e"
30
writeComment "charset=Unicode A Unicode comment" "0000 55 4e 49 43 4f 44 45 00 41 00 20 00 55 00 6e 00 UNICODE.A. .U.n. 0010 69 00 63 00 6f 00 64 00 65 00 20 00 63 00 6f 00 i.c.o.d.e. .c.o. 0020 6d 00 6d 00 65 00 6e 00 74 00 m.m.e.n.t."
31
writeComment "charset=Unicode \\u01c4" "0000 55 4e 49 43 4f 44 45 00 c4 01 UNICODE..."
32
writeComment "charset=Unicode A\\u01c4C" "0000 55 4e 49 43 4f 44 45 00 41 00 c4 01 43 00 UNICODE.A...C."
33
writeComment "charset=Unicode With\\nNewline" "0000 55 4e 49 43 4f 44 45 00 57 00 69 00 74 00 68 00 UNICODE.W.i.t.h. 0010 0a 00 4e 00 65 00 77 00 6c 00 69 00 6e 00 65 00 ..N.e.w.l.i.n.e."
34
writeComment "charset=Unicode With\\tTab" "0000 55 4e 49 43 4f 44 45 00 57 00 69 00 74 00 68 00 UNICODE.W.i.t.h. 0010 09 00 54 00 61 00 62 00 ..T.a.b."
35

  
36
# Test invalid escape sequences
37
writeComment "charset=Unicode \\ugggg" "0000 55 4e 49 43 4f 44 45 00 5c 00 75 00 67 00 67 00 UNICODE.\.u.g.g. 0010 67 00 67 00 g.g."
test/Makefile (working copy)
61 61
        bugfixes-test.sh  \
62 62
        exifdata-test.sh  \
63 63
        exiv2-test.sh     \
64
        exifcomment-encoding-test.sh \
64 65
        imagetest.sh      \
65 66
        iotest.sh         \
66 67
        iptctest.sh       \
(2-2/5)