Hi there,
I'm trying to use libpcre3 for pattern matching. ;-) My problem is that
I can't get the German umlaut "Ä" (U+00C4) to match "ä" (U+00E4)
when using the CASELESS option for pattern matching. Since my project uses
Unicode, I tried it with UTF-8. But using a character table didn't work
for me either.
My project uses Qt4 and to create UTF8 strings I used some Qt code in my
minimal test code. Perhaps you could convert it, if you don't have qt
development files installed.
At first I only had the problem mentioned above (in the project code):
Ä didn't match ä.
Now (in my example code) Ä doesn't even match Ä, and I don't
know why. I tried quite a few things.
Minimal example code is attached.
My output is as follows:
codec: "UTF-8"
UNICODE_PROPERTIES: 1
match: -1
I believe I'm doing something stupid, so if you could show me an example
where I can caselessly match "Ä" (U+00C4) with "ä" (U+00E4), I would
really appreciate that.
Sincerely,
Enno Gröper
#############################################################################
# Makefile for building: cdcat
# Generated by qmake (2.01a) (Qt 4.2.3) on: Fr Okt 26 22:54:54 2007
# Project: cdcat.pro
# Template: app
# Command: /usr/bin/qmake-qt4 -unix -o Makefile cdcat.pro
#############################################################################
####### Compiler, tools and options

# Toolchain and flags shared by the implicit rules and the explicit
# compile/link rules below.
CC            = gcc
CXX           = g++
LEX           = flex
YACC          = yacc
DEFINES       = -DQT_QT3SUPPORT_LIB -DQT3_SUPPORT -DQT_GUI_LIB -DQT_CORE_LIB
CFLAGS        = -pipe -g -D_REENTRANT -Wall -W $(DEFINES)
CXXFLAGS      = -pipe -g -D_REENTRANT -Wall -W $(DEFINES)
LEXFLAGS      =
YACCFLAGS     = -d
# Header search path. Duplicate -I entries were dropped; the order of first
# occurrence is kept, so header lookup is unchanged.
INCPATH       = -I/usr/share/qt4/mkspecs/linux-g++ -I. -I/usr/include/qt4/QtCore -I/usr/include/qt4/QtGui -I/usr/include/qt4/Qt3Support -I/usr/include/qt4 -Imoc_files
LINK          = g++
LFLAGS        =
# SUBLIBS is not assigned anywhere in this file; it expands to nothing unless
# it is provided on the command line or in the environment.
LIBS          = $(SUBLIBS) -L/usr/lib -lpcre -lQtCore
TARGET        = pcre_tst

####### Files

SOURCES       = pcre_tst.cpp
OBJECTS       = pcre_tst.o

# These targets name actions, not files; declare them phony so a stray file
# called "all", "first" or "clean" cannot mask them.
.PHONY: first all clean

# Default goal (first target in the file).
first: all

####### Implicit rules
# NOTE: every recipe line below must start with a TAB character.

.SUFFIXES: .o .c .cpp .cc .cxx .C

.cpp.o:
	$(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"

.cc.o:
	$(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"

.cxx.o:
	$(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"

.C.o:
	$(CXX) -c $(CXXFLAGS) $(INCPATH) -o "$@" "$<"

.c.o:
	$(CC) -c $(CFLAGS) $(INCPATH) -o "$@" "$<"

####### Build rules

all: $(TARGET)

# Link step. OBJCOMP is not assigned in this file and expands to nothing
# unless supplied externally.
$(TARGET): $(OBJECTS)
	$(LINK) $(LFLAGS) -o $(TARGET) $(OBJECTS) $(OBJCOMP) $(LIBS)

# The leading '-' makes make continue even if rm reports an error.
clean:
	-rm -f $(OBJECTS)
	-rm -f $(TARGET)

####### Compile

pcre_tst.o: pcre_tst.cpp
	$(CXX) -c $(CXXFLAGS) $(INCPATH) -o pcre_tst.o pcre_tst.cpp
#include <QString>
#include <QTextCodec>
#include <pcre.h>
#include <string.h>
#include <ctype.h>
#include <QtDebug>
int main(int argc, char * argv[]) {
QString patt;
QString txt;
pcre *re;
int pcre_opt = PCRE_UTF8;
pcre_extra *hints;
const char *error;
int errptr;
int offsets[99];
int match;
int utf8prop;
QTextCodec * codec;
QTextCodec::setCodecForCStrings(QTextCodec::codecForName("UTF-8"));
codec = QTextCodec::codecForCStrings();
if (codec == NULL)
qDebug() << "standard codec (latin-1)";
else
qDebug() << "codec:" << codec->name();
pcre_opt |= PCRE_CASELESS;
patt = "/Ä/";
QByteArray encPatt = patt.toUtf8();
pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &utf8prop);
qDebug() << "UNICODE_PROPERTIES:" << utf8prop;
//re = pcre_compile((const char*)patt.data(),pcre_opt,&error,&errptr,NULL);
re = pcre_compile(encPatt.constData(),pcre_opt,&error,&errptr,NULL);
if(re == NULL)
{
qDebug() << "(comp) Error in the pattern:" << error << ": " << errptr;
return 1;
}
hints = pcre_study(re,0,&error);
if(error != NULL)
{
qDebug() << "(study) Error in the pattern:" << error;
return 1;
}
txt = "Ä";
QByteArray enc = txt.toUtf8();
//match = pcre_exec(re,hints,(const char*)&txt,txt.size(),0,0,offsets,99);
match = pcre_exec(re,hints,enc.constData(),enc.size(),0,0,offsets,99);
qDebug() << "match:" << match;
return 0;
}