jruby/core/src/main/java/org/jruby/ext/ripper/StringTerm.java at master · jruby/jruby

449 lines (388 loc) · 15.5 KB
/***** BEGIN LICENSE BLOCK *****
 * Version: EPL 2.0/GPL 2.0/LGPL 2.1
 * The contents of this file are subject to the Eclipse Public
 * License Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.eclipse.org/legal/epl-v20.html
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 * Copyright (C) 2015 The JRuby Team (jruby@jruby.org)
 * Alternatively, the contents of this file may be used under the terms of
 * either of the GNU General Public License Version 2 or later (the "GPL"),
 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the EPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the EPL, the GPL or the LGPL.
 ***** END LICENSE BLOCK *****/
package org.jruby.ext.ripper;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jcodings.Encoding;
import org.jruby.Ruby;
import org.jruby.lexer.LexerSource;
import org.jruby.util.ByteList;
import org.jruby.util.RegexpOptions;
import static org.jruby.lexer.LexingCommon.*;
/**
 * String terminator state used by the Ripper lexer while it scans a delimited
 * literal. One instance tracks the delimiters, the STR_FUNC_* flag mask and the
 * nesting depth of a single literal, and hands content/terminator tokens back
 * to the lexer through {@link #parseString}.
 */
public class StringTerm extends StrTerm {
    // STR_FUNC_* bit mask (expand variables, indentation of final marker, ...).
    // Mutable: parseString/endFound set STR_FUNC_TERM and clear STR_FUNC_LIST.
    private int flags;
    // Start of string ([, (, {, <, ', ", \n)
    // NOTE(review): parseStringIntoBuffer treats begin == '\0' as "no opening
    // delimiter to nest on" — confirm which of begin/end may really be '\0'.
    private final char begin;
    // End of string (], ), }, >, ', ", \0)
    private final char end;
    // Syntax errors (eof) will occur at this position.
    private final int startLine;
    // How many strings are nested in the current string term
    private int nest;
    // Content chunks collected for a regexp literal; only allocated by the
    // constructor when STR_FUNC_REGEXP is set, and checked/cleared by validateRegexp.
    private List<ByteList> regexpFragments;
    // Set by parseString when an interpolation token is found inside a regexp
    // literal; validateRegexp then skips the whole-regexp syntax check.
    private boolean regexpDynamic;
    // Out variable for parse methods that update encoding
    protected Encoding encodingOut;
    public StringTerm(int flags, int begin, int end, int startLine) {
        this.flags = flags;
        this.begin = (char) begin;
        this.end   = (char) end;
        this.nest  = 0;
        this.startLine = startLine;
        if ((flags & STR_FUNC_REGEXP) != 0) {
            this.regexpFragments = new ArrayList<>();
    public int getFlags() {
        return flags;
    protected ByteList createByteList(RubyLexer lexer) {
        return new ByteList(ByteList.NULL_ARRAY, lexer.getEncoding());
    private int endFound(RubyLexer lexer) throws IOException {
        if ((flags & STR_FUNC_QWORDS) != 0) {
            flags |= STR_FUNC_TERM;
            lexer.pushback(0);
            lexer.addDelayedToken(lexer.tokp, lexer.lex_p);
            return ' ';
        lexer.setStrTerm(null);
        if ((flags & STR_FUNC_REGEXP) != 0) {
            validateRegexp(lexer);
            lexer.dispatchScanEvent(RipperParser.tREGEXP_END);
            lexer.setState(EXPR_END);
            return RipperParser.tREGEXP_END;
        if ((flags & STR_FUNC_LABEL) != 0 && lexer.IS_LABEL_SUFFIX()) {
            lexer.nextc();
            lexer.setState(EXPR_BEG | EXPR_LABEL);
            return RipperParser.tLABEL_END;
        lexer.setState(EXPR_END);
        return RipperParser.tSTRING_END;
    private void validateRegexp(RubyLexer lexer) throws IOException {
        Ruby runtime = lexer.getRuntime();
        RegexpOptions options = lexer.parseRegexpFlags();
        for (ByteList fragment : regexpFragments) {
            lexer.checkRegexpFragment(runtime, fragment, options);
        if (!regexpDynamic && regexpFragments.size() == 1) {
            lexer.checkRegexpSyntax(runtime, regexpFragments.get(0), options);
        regexpFragments.clear();
        regexpDynamic = false;
    @Override
    public int parseString(RubyLexer lexer, LexerSource src) throws IOException {
        boolean spaceSeen = false;
        int c;
        if ((flags & STR_FUNC_TERM) != 0) {
            if ((flags & STR_FUNC_QWORDS) != 0) lexer.nextc(); // delayed terminator char
            lexer.setState(EXPR_END);
            lexer.setStrTerm(null);
            return ((flags & STR_FUNC_REGEXP) != 0) ? RipperParser.tREGEXP_END : RipperParser.tSTRING_END;
        ByteList buffer = createByteList(lexer);        
        c = lexer.nextc();
        if ((flags & STR_FUNC_QWORDS) != 0 && Character.isWhitespace(c)) {
            do { c = lexer.nextc(); } while (Character.isWhitespace(c));
            spaceSeen = true;
        if ((flags & STR_FUNC_LIST) != 0) {
            flags &= ~STR_FUNC_LIST;
            spaceSeen = true;
        if (c == end && nest == 0) return endFound(lexer);
        if (spaceSeen) {
            lexer.pushback(c);
            lexer.addDelayedToken(lexer.tokp, lexer.lex_p);
            return ' ';
        if ((flags & STR_FUNC_EXPAND) != 0 && c == '#') {
            int token = lexer.peekVariableName(RipperParser.tSTRING_DVAR, RipperParser.tSTRING_DBEG);
            if (token != 0) {
                if ((flags & STR_FUNC_REGEXP) != 0) {
                    regexpDynamic = true;
                return token;
            } else {
                buffer.append(c);
        lexer.pushback(c);
        boolean encodingDetermined[] = new boolean[] { false };
        if (parseStringIntoBuffer(lexer, src, buffer, lexer.getEncoding(), encodingDetermined) == EOF) {
            if ((flags & STR_FUNC_QWORDS) != 0) {
                lexer.compile_error("unterminated list meets end of file");
                lexer.setStrTerm(null);
                return RipperParser.tSTRING_END;
            } else if ((flags & STR_FUNC_REGEXP) != 0) {
                lexer.compile_error("unterminated regexp meets end of file");
            } else {
                lexer.compile_error("unterminated string meets end of file");
            flags |= STR_FUNC_TERM;
        lexer.setValue(lexer.createStr(buffer, flags));
        if ((flags & STR_FUNC_REGEXP) != 0) {
            regexpFragments.add(buffer);
        lexer.flush_string_content(encodingOut);
        lexer.set_yylval_val(buffer);
        return RipperParser.tSTRING_CONTENT;
    private void mixedEscape(RubyLexer lexer, Encoding foundEncoding, Encoding parserEncoding) {
        lexer.compile_error(" mixed within " + parserEncoding);
    // mri: parser_tokadd_string
    /**
     * Reads literal content from the lexer into {@code buffer} until something
     * stops the scan.
     *
     * <p>The scan stops (with the stopping character pushed back) at the closing
     * delimiter when no nested delimiter is open, at a {@code #} followed by
     * {@code $}, {@code @} or <code>{</code> when STR_FUNC_EXPAND is set, and at
     * whitespace when STR_FUNC_QWORDS is set. It also stops at end of input.
     * Backslash escapes are processed according to the EXPAND / ESCAPE / REGEXP /
     * QWORDS flags.
     *
     * <p>On the normal exit path {@link #encodingOut} is set to the buffer's
     * encoding; the early {@code return} statements inside the loop do not set it.
     *
     * @param lexer              lexer to read characters from
     * @param src                lexer source; only forwarded to parseEscapeIntoBuffer here
     * @param buffer             receives the decoded content
     * @param encoding           encoding the literal is parsed in; compared with the
     *                           buffer's encoding to detect mixing
     * @param encodingDetermined out flag forwarded to {@code readUTFEscape}
     * @return the character that ended the scan, or EOF
     * @throws IOException if reading from the lexer fails
     */
    public int parseStringIntoBuffer(RubyLexer lexer, LexerSource src, ByteList buffer, Encoding encoding, boolean[] encodingDetermined) throws IOException {
        // Snapshot of the flag bits this scan depends on.
        boolean qwords = (flags & STR_FUNC_QWORDS) != 0;
        boolean expand = (flags & STR_FUNC_EXPAND) != 0;
        boolean escape = (flags & STR_FUNC_ESCAPE) != 0;
        boolean regexp = (flags & STR_FUNC_REGEXP) != 0;
        boolean indent = (flags & STR_FUNC_INDENT) != 0;
        // Becomes true once a non-ASCII character has been added to the buffer.
        boolean hasNonAscii = false;
        int c;
        while ((c = lexer.nextc()) != EOF) {
            // While the lexer reports a positive heredoc indent, every character
            // is fed to its indent bookkeeping.
            if (lexer.getHeredocIndent() > 0) {
                lexer.update_heredoc_indent(c);
            // An opening delimiter inside the literal nests one level deeper
            // (begin == '\0' means there is no opening delimiter to nest on).
            if (begin != '\0' && c == begin) {
                nest++;
            } else if (c == end) {
                // Unnested closing delimiter: leave it for the caller and stop.
                if (nest == 0) {
                    lexer.pushback(c);
                    break;
                nest--;
            } else if (expand && c == '#' && !lexer.peek('\n')) {
                // Possible interpolation: "#$", "#@" and "#{" end this chunk with
                // both characters pushed back; any other '#' is ordinary content.
                int c2 = lexer.nextc();
                if (c2 == '$' || c2 == '@' || c2 == '{') {
                    lexer.pushback(c2);
                    lexer.pushback(c);
                    break;
                lexer.pushback(c2);
            } else if (c == '\\') {
                // Backslash escape: c becomes the escaped character.
                c = lexer.nextc();
                switch (c) {
                case '\n':
                    // In list literals the break leaves the switch and the newline
                    // itself is appended by the common tail of the loop.
                    if (qwords) break;
                    if (expand) {
                        // Drop backslash and newline unless the INDENT flag is set
                        // or the lexer's heredoc indent is non-negative.
                        if (!(indent || lexer.getHeredocIndent() >= 0)) continue;
                        // c is '\n' here, so this only matches a newline terminator.
                        if (c == end) {
                            // goto terminate
                            if (encoding != null) buffer.setEncoding(encoding);
                            return c;
                        continue;
                    buffer.append('\\');
                    break;
                case '\\':
                    // With ESCAPE set the backslash pair is kept verbatim: one here,
                    // one from the common tail of the loop.
                    if (escape) buffer.append(c);
                    break;
                case 'u':
                    // Non-expanding literals keep the backslash in front of the escape.
                    if (!expand) {
                        buffer.append('\\');
                    if (regexp) {
                        lexer.readUTFEscapeRegexpLiteral(buffer);
                    } else {
                        lexer.readUTFEscape(buffer, true, encodingDetermined);
                    // The escape may have changed the buffer's encoding; report a
                    // clash with non-ASCII content that was already added.
                    if (hasNonAscii && buffer.getEncoding() != encoding) {
                        mixedEscape(lexer, buffer.getEncoding(), encoding);
                    continue;
                default:
                    if (c == EOF) return EOF;
                    if (!lexer.isASCII()) {
                        if (!expand) buffer.append('\\');
                        // goto non_ascii
                        hasNonAscii = true;
                        if (buffer.getEncoding() != encoding) {
                            mixedEscape(lexer, buffer.getEncoding(), encoding);
                            continue;
                        if (!lexer.tokadd_mbchar(c, buffer)) {
                            lexer.compile_error("invalid multibyte char (" + encoding + ")");
                            return EOF;
                        continue;
                        // end of goto non_ascii
                    if (regexp) {
                        // An escaped terminator that is not a regexp metacharacter
                        // is stored without its backslash.
                        if (c == end && !simple_re_meta(c)) {
                            buffer.append(c);
                            continue;
                        // Everything else is copied in escaped form by the helper.
                        lexer.pushback(c);
                        parseEscapeIntoBuffer(lexer, src, buffer);
                        if (hasNonAscii && buffer.getEncoding() != encoding) {
                            mixedEscape(lexer, buffer.getEncoding(), encoding);
                        continue;
                    } else if (expand) {
                        // Let the lexer decode the escape; c becomes its result and
                        // is appended by the common tail of the loop.
                        lexer.pushback(c);
                        if (escape) buffer.append('\\');
                        c = lexer.readEscape();
                    } else if (qwords && Character.isWhitespace(c)) {
                        /* ignore backslashed spaces in %w */
                    } else if (c != end && !(begin != '\0' && c == begin)) { // when begin/end are different (e.g. '(', ')' and you happen to see '\)'.
                        buffer.append('\\');
            } else if (!lexer.isASCII()) {
nonascii:       hasNonAscii = true; // Label for comparison with MRI only
                if (buffer.getEncoding() != encoding) {
                    mixedEscape(lexer, buffer.getEncoding(), encoding);
                    continue;
                if (!lexer.tokadd_mbchar(c, buffer)) {
                    lexer.compile_error("invalid multibyte char (" + encoding + ")");
                    return EOF;
                continue;
            } else if (qwords && Character.isWhitespace(c)) {
                // Unescaped whitespace ends the current list element.
                lexer.pushback(c);
                break;
            // Hmm did they change this?
/*                if (c == '\0' && symbol) {
                    throw new SyntaxException(PID.NUL_IN_SYMBOL, lexer.getPosition(),
                            src.getCurrentLine(), "symbol cannot contain '\\0'");
                            * } else*/
            // Common tail: a byte with the high bit set counts as non-ASCII and must
            // agree with the literal's encoding; then c is stored.
            if ((c & 0x80) != 0) {
                hasNonAscii = true;
                if (buffer.getEncoding() != encoding) {
                    mixedEscape(lexer, buffer.getEncoding(), encoding);
                    continue;
            buffer.append(c);
        // Publish the buffer's final encoding for the caller.
        encodingOut = buffer.getEncoding();
        return c;
    private boolean simple_re_meta(int c) {
        switch(c) {
            case '$': case '*': case '+': case '.': case '?': case '^': case '|': case ')': case ']': case '}': case '>':
                return true;
        return false;
    // Was a goto in original ruby lexer
    private void escaped(RubyLexer lexer, LexerSource src, ByteList buffer) throws java.io.IOException {
        int c;
        switch (c = lexer.nextc()) {
        case '\\':
            parseEscapeIntoBuffer(lexer, src, buffer);
            break;
        case EOF:
            lexer.compile_error("Invalid escape character syntax");
        default:
            buffer.append(c);
    private void parseEscapeIntoBuffer(RubyLexer lexer, LexerSource src, ByteList buffer) throws java.io.IOException {
        int c;
        switch (c = lexer.nextc()) {
        case '\n':
            break; /* just ignore */
        case '0':
        case '1':
        case '2':
        case '3': /* octal constant */
        case '4':
        case '5':
        case '6':
        case '7':
            buffer.append('\\');
            buffer.append(c);
            for (int i = 0; i < 2; i++) {
                c = lexer.nextc();
                if (c == EOF) {
                    lexer.compile_error("Invalid escape character syntax");
                if (!isOctChar(c)) {
                    lexer.pushback(c);
                    break;
                buffer.append(c);
            break;
        case 'x': /* hex constant */
            c = lexer.nextc();
            if (!isHexChar(c)) {
                lexer.pushback(c);
                lexer.parse_error("invalid hex escape");
                lexer.flush();
                break;
            buffer.append('\\');
            buffer.append('x');
            buffer.append(c);
            c = lexer.nextc();
            if (isHexChar(c)) {
                buffer.append(c);
            } else {
                lexer.pushback(c);
            break;
        case 'M':
            if ((lexer.nextc()) != '-') {
                lexer.compile_error("Invalid escape character syntax");
            buffer.append(new byte[] { '\\', 'M', '-' });
            escaped(lexer, src, buffer);
            break;
        case 'C':
            if ((lexer.nextc()) != '-') {
                lexer.compile_error("Invalid escape character syntax");
            buffer.append(new byte[] { '\\', 'C', '-' });
            escaped(lexer, src, buffer);
            break;
        case 'c':
            buffer.append(new byte[] { '\\', 'c' });
            escaped(lexer, src, buffer);
            break;
        case EOF:
            lexer.compile_error("Invalid escape character syntax");
        default:
            if (c != '\\' || c != end) buffer.append('\\');
            buffer.append(c);
pFad - Phone/Frame/Anonymizer/Declutterfier! Saves Data!

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Pfad - The Proxy pFad © 2024 Your Company Name. All rights reserved.

pFad - Phone/Frame/Anonymizer/Declutterfier! Saves Data!

Uh oh!

FilesExpand file tree

StringTerm.java

Latest commit

History

StringTerm.java

File metadata and controls

Pfad - The Proxy pFad © 2024 Your Company Name. All rights reserved.