원본 본문으로 이동하기

솔라/루씬 6.2.x 버전부터 발생하는 아리랑 한글 형태소 분석기 오류 해결법

박용서 - 루씬 한글분석기 커뮤니티 (아래 주소) 에서 제공하는 한글형태소 분석기가 솔라/루씬 6.2.x 버전부터 오류가 발생하여 적습니다. http://cafe.naver.com/korlucene 문제 : 6.2.0 버전부터 java.lang.NoClassDefFoundError: org/apache/lucene/analysis/util/CharacterUtils 오류가 납니다. 원인 : CharacterUtils 가 이동됨과 동시에 기능도 조금 바뀌었습니다. 해결법 1. http://cafe.naver.com/korlucene/1305 에서 프로젝트를 다운받습니다. 2. http://cafe.naver.com/korlucene/1304 에서 arirang-morph-1.0.3.jar 를 다운받습니다. 3. 위에서 받은 프로젝트에 lib 폴더를 생성한 후 arirang-morph-1.0.3.jar 를 넣습니다. pom.xml 수정 4. 아래 디펜던시를 추가해주세요. <dependency> <groupId>com.argo</groupId> <artifactId>arirang-morph</artifactId> <version>1.0.3</version> <scope>system</scope> <systemPath>${project.basedir}/lib/arirang-morph-1.0.3.jar</systemPath> </dependency> 5. 아래 디펜던시를 제거해주세요. 원래 써있는 버전이 더 높은 버전이지만.. https://github.com/soomyung 에서도 해당 버전을 찾을 수 없습니다. 파일로 구할 수 있는 것은 현재 1.0.3이 최신인 것 같습니다. <dependency> <groupId>com.argo</groupId> <artifactId>arirang-morph</artifactId> <version>1.1.0</version> </dependency> 6. 아래 디펜던시의 버전을 바꿔주세요. <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-core</artifactId> <version>6.2.1</version> </dependency> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-analyzers-common</artifactId> <version>6.2.1</version> </dependency> 7. org.apache.lucene.analysis.ko.KoreanTokenizer 의 소스코드를 고쳐야 합니다. 7-1. CharacterUtils 의 경로와 소스가 바뀌었습니다. - 기존 : org.apache.lucene.analysis.util.CharacterUtils - 현재 : org.apache.lucene.analysis.CharacterUtils (6.2 이후) 7-2. CharacterUtils 는 더 이상 인스턴스로 만들 수 없으며, 오직 static으로 써야 합니다. - (정정 전 안내) CharacterUtils 는 더 이상 codePointAt 를 지원하지 않기 때문에 codePointAt 를 직접 구현해야 한다고 적었으나, 그럴 필요가 없습니다. - http://cafe.naver.com/korlucene/1320 에 댓글을 달아주신 netman2k님의 소스를 보고 Character.codePointAt 가 있다는 것을 알았습니다. 7-3. CharacterUtils.codePointAt (존재하지 않는 메서드)를 Character.codePointAt 로 교체합니다. 8. 이렇게 모두 수정해주셨다면 mvn clean install 로 리빌드하여 기존의 arirang.lucene-analyzer-5.0-1.1.0.jar 를 교체합니다. http://cafe.naver.com/korlucene/1320 에 수정한 jar 파일을 올려두었습니다. 
추신 - CharacterUtils 변경점 기존 : org.apache.lucene.analysis.util.CharacterUtils 현재 : org.apache.lucene.analysis.CharacterUtils (6.2 이후) 기존코드 package org.apache.lucene.analysis.util; import java.io.IOException; import java.io.Reader; public abstract class CharacterUtils { private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils(); private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils(); public static CharacterUtils getInstance() { return JAVA_5; } @Deprecated public static CharacterUtils getJava4Instance() { return JAVA_4; } public abstract int codePointAt(CharSequence paramCharSequence, int paramInt); public abstract int codePointAt(char[] paramArrayOfChar, int paramInt1, int paramInt2); public abstract int codePointCount(CharSequence paramCharSequence); public static CharacterBuffer newCharacterBuffer(int bufferSize) { if (bufferSize < 2) { throw new IllegalArgumentException("buffersize must be >= 2"); } return new CharacterBuffer(new char[bufferSize], 0, 0); } public final void toLowerCase(char[] buffer, int offset, int limit) { assert (buffer.length >= limit); assert ((offset <= 0) && (offset <= buffer.length)); for (int i = offset; i < limit;) { i += Character.toChars( Character.toLowerCase( codePointAt(buffer, i, limit)), buffer, i); } } public final void toUpperCase(char[] buffer, int offset, int limit) { assert (buffer.length >= limit); assert ((offset <= 0) && (offset <= buffer.length)); for (int i = offset; i < limit;) { i += Character.toChars( Character.toUpperCase( codePointAt(buffer, i, limit)), buffer, i); } } public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) { if (srcLen < 0) { throw new IllegalArgumentException("srcLen must be >= 0"); } int codePointCount = 0; for (int i = 0; i < srcLen;) { int cp = codePointAt(src, srcOff + i, srcOff + srcLen); int charCount = Character.charCount(cp); dest[(destOff + codePointCount++)] = cp; i += charCount; } return codePointCount; } public 
final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) { if (srcLen < 0) { throw new IllegalArgumentException("srcLen must be >= 0"); } int written = 0; for (int i = 0; i < srcLen; i++) { written += Character.toChars(src[(srcOff + i)], dest, destOff + written); } return written; } public abstract boolean fill(CharacterBuffer paramCharacterBuffer, Reader paramReader, int paramInt) throws IOException; public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException { return fill(buffer, reader, buffer.buffer.length); } public abstract int offsetByCodePoints(char[] paramArrayOfChar, int paramInt1, int paramInt2, int paramInt3, int paramInt4); static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException { int read = 0; while (read < len) { int r = reader.read(dest, offset + read, len - read); if (r == -1) { break; } read += r; } return read; } private static final class Java5CharacterUtils extends CharacterUtils { public int codePointAt(CharSequence seq, int offset) { return Character.codePointAt(seq, offset); } public int codePointAt(char[] chars, int offset, int limit) { return Character.codePointAt(chars, offset, limit); } public boolean fill(CharacterUtils.CharacterBuffer buffer, Reader reader, int numChars) throws IOException { assert (buffer.buffer.length >= 2); if ((numChars < 2) || (numChars > buffer.buffer.length)) { throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size"); } char[] charBuffer = buffer.buffer; buffer.offset = 0; int offset; int offset; if (buffer.lastTrailingHighSurrogate != 0) { charBuffer[0] = buffer.lastTrailingHighSurrogate; buffer.lastTrailingHighSurrogate = '\000'; offset = 1; } else { offset = 0; } int read = readFully(reader, charBuffer, offset, numChars - offset); buffer.length = (offset + read); boolean result = buffer.length == numChars; if (buffer.length < numChars) { return result; } if 
(Character.isHighSurrogate(charBuffer[(buffer.length - 1)])) { buffer.lastTrailingHighSurrogate = charBuffer[CharacterUtils.CharacterBuffer.access$206(buffer)]; } return result; } public int codePointCount(CharSequence seq) { return Character.codePointCount(seq, 0, seq.length()); } public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) { return Character.offsetByCodePoints(buf, start, count, index, offset); } } private static final class Java4CharacterUtils extends CharacterUtils { public int codePointAt(CharSequence seq, int offset) { return seq.charAt(offset); } public int codePointAt(char[] chars, int offset, int limit) { if (offset >= limit) { throw new IndexOutOfBoundsException("offset must be less than limit"); } return chars[offset]; } public boolean fill(CharacterUtils.CharacterBuffer buffer, Reader reader, int numChars) throws IOException { assert (buffer.buffer.length >= 1); if ((numChars < 1) || (numChars > buffer.buffer.length)) { throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size"); } buffer.offset = 0; int read = readFully(reader, buffer.buffer, 0, numChars); buffer.length = read; buffer.lastTrailingHighSurrogate = '\000'; return read == numChars; } public int codePointCount(CharSequence seq) { return seq.length(); } public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) { int result = index + offset; if ((result < 0) || (result > count)) { throw new IndexOutOfBoundsException(); } return result; } } public static final class CharacterBuffer { private final char[] buffer; private int offset; private int length; char lastTrailingHighSurrogate; CharacterBuffer(char[] buffer, int offset, int length) { this.buffer = buffer; this.offset = offset; this.length = length; } public char[] getBuffer() { return this.buffer; } public int getOffset() { return this.offset; } public int getLength() { return this.length; } public void reset() { this.offset = 0; this.length = 
0; this.lastTrailingHighSurrogate = '\000'; } } } 6.2 이후 코드 /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis; import java.io.IOException; import java.io.Reader; /** * Utility class to write tokenizers or token filters. * @lucene.internal */ public final class CharacterUtils { private CharacterUtils() {} // no instantiation /** * Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code> * of the given bufferSize. * * @param bufferSize * the internal char buffer size, must be <code>>= 2</code> * @return a new {@link CharacterBuffer} instance. */ public static CharacterBuffer newCharacterBuffer(final int bufferSize) { if (bufferSize < 2) { throw new IllegalArgumentException("buffersize must be >= 2"); } return new CharacterBuffer(new char[bufferSize], 0, 0); } /** * Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting * at the given offset. 
* @param buffer the char buffer to lowercase * @param offset the offset to start at * @param limit the max char in the buffer to lower case */ public static void toLowerCase(final char[] buffer, final int offset, final int limit) { assert buffer.length >= limit; assert offset <=0 && offset <= buffer.length; for (int i = offset; i < limit;) { i += Character.toChars( Character.toLowerCase( Character.codePointAt(buffer, i, limit)), buffer, i); } } /** * Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)} starting * at the given offset. * @param buffer the char buffer to UPPERCASE * @param offset the offset to start at * @param limit the max char in the buffer to lower case */ public static void toUpperCase(final char[] buffer, final int offset, final int limit) { assert buffer.length >= limit; assert offset <=0 && offset <= buffer.length; for (int i = offset; i < limit;) { i += Character.toChars( Character.toUpperCase( Character.codePointAt(buffer, i, limit)), buffer, i); } } /** Converts a sequence of Java characters to a sequence of unicode code points. * @return the number of code points written to the destination buffer */ public static int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) { if (srcLen < 0) { throw new IllegalArgumentException("srcLen must be >= 0"); } int codePointCount = 0; for (int i = 0; i < srcLen; ) { final int cp = Character.codePointAt(src, srcOff + i, srcOff + srcLen); final int charCount = Character.charCount(cp); dest[destOff + codePointCount++] = cp; i += charCount; } return codePointCount; } /** Converts a sequence of unicode code points to a sequence of Java characters. 
* @return the number of chars written to the destination buffer */ public static int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) { if (srcLen < 0) { throw new IllegalArgumentException("srcLen must be >= 0"); } int written = 0; for (int i = 0; i < srcLen; ++i) { written += Character.toChars(src[srcOff + i], dest, destOff + written); } return written; } /** * Fills the {@link CharacterBuffer} with characters read from the given * reader {@link Reader}. This method tries to read <code>numChars</code> * characters into the {@link CharacterBuffer}, each call to fill will start * filling the buffer from offset <code>0</code> up to <code>numChars</code>. * In case code points can span across 2 java characters, this method may * only fill <code>numChars - 1</code> characters in order not to split in * the middle of a surrogate pair, even if there are remaining characters in * the {@link Reader}. * <p> * This method guarantees * that the given {@link CharacterBuffer} will never contain a high surrogate * character as the last element in the buffer unless it is the last available * character in the reader. In other words, high and low surrogate pairs will * always be preserved across buffer boarders. * </p> * <p> * A return value of <code>false</code> means that this method call exhausted * the reader, but there may be some bytes which have been read, which can be * verified by checking whether <code>buffer.getLength() > 0</code>. * </p> * * @param buffer * the buffer to fill. * @param reader * the reader to read characters from. * @param numChars * the number of chars to read * @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer * @throws IOException * if the reader throws an {@link IOException}. 
*/ public static boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException { assert buffer.buffer.length >= 2; if (numChars < 2 || numChars > buffer.buffer.length) { throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size"); } final char[] charBuffer = buffer.buffer; buffer.offset = 0; final int offset; // Install the previously saved ending high surrogate: if (buffer.lastTrailingHighSurrogate != 0) { charBuffer[0] = buffer.lastTrailingHighSurrogate; buffer.lastTrailingHighSurrogate = 0; offset = 1; } else { offset = 0; } final int read = readFully(reader, charBuffer, offset, numChars - offset); buffer.length = offset + read; final boolean result = buffer.length == numChars; if (buffer.length < numChars) { // We failed to fill the buffer. Even if the last char is a high // surrogate, there is nothing we can do return result; } if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) { buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length]; } return result; } /** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */ public static boolean fill(CharacterBuffer buffer, Reader reader) throws IOException { return fill(buffer, reader, buffer.buffer.length); } static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException { int read = 0; while (read < len) { final int r = reader.read(dest, offset + read, len - read); if (r == -1) { break; } read += r; } return read; } /** * A simple IO buffer to use with * {@link CharacterUtils#fill(CharacterBuffer, Reader)}. 
*/ public static final class CharacterBuffer { private final char[] buffer; private int offset; private int length; // NOTE: not private so outer class can access without // $access methods: char lastTrailingHighSurrogate; CharacterBuffer(char[] buffer, int offset, int length) { this.buffer = buffer; this.offset = offset; this.length = length; } /** * Returns the internal buffer * * @return the buffer */ public char[] getBuffer() { return buffer; } /** * Returns the data offset in the internal buffer. * * @return the offset */ public int getOffset() { return offset; } /** * Return the length of the data in the internal buffer starting at * {@link #getOffset()} * * @return the length */ public int getLength() { return length; } /** * Resets the CharacterBuffer. All internals are reset to its default * values. */ public void reset() { offset = 0; length = 0; lastTrailingHighSurrogate = 0; } } } - 루씬 솔라 자바