# frozen_string_literal: true
require_relative "delete_suffix"
require_relative "match_p"
using CSV::DeleteSuffix if CSV.const_defined?(:DeleteSuffix)
using CSV::MatchP if CSV.const_defined?(:MatchP)
# Note: Don't use this class directly. This is an internal class.
# A CSV::Parser is m17n aware. The parser works in the Encoding of the IO
# or String object being read from or written to. Your data is never transcoded
# (unless you ask Ruby to transcode it for you) and will literally be parsed in
# the Encoding it is in. Thus CSV will return Arrays or Rows of Strings in the
# Encoding of your data. This is accomplished by transcoding the parser itself
# into your Encoding.
# Raised when scanned data fails a +valid_encoding?+ check, i.e. its bytes
# are not valid in the encoding it is tagged with.
class InvalidEncoding < StandardError
# CSV::Scanner receives a CSV output, scans it and returns the content.
# It also controls the life cycle of the object with its methods +keep_start+,
# +keep_end+, +keep_back+, +keep_drop+.
# Uses StringScanner (the official strscan gem). Strscan provides lexical
# scanning operations on a String. We inherit from it and take advantage
# of its methods. For more information, please visit:
# https://ruby-doc.org/stdlib-2.6.1/libdoc/strscan/rdoc/StringScanner.html
class Scanner < StringScanner
# Expose the inherited StringScanner#scan under the additional name +scan_all+.
alias_method :scan_all, :scan
# Iterates over the not-yet-scanned text (+rest+), split on +row_separator+.
def each_line(row_separator)
rest.each_line(row_separator) do |line|
# Positions are tracked in bytes, hence +bytesize+ rather than +size+.
position += line.bytesize
# Byte range of the scanned string from +start+ up to the current +pos+.
# NOTE(review): the method enclosing this expression is not visible here.
string.byteslice(start, pos - start)
# CSV::InputsScanner receives IO inputs, encoding and the chunk_size.
# It also controls the life cycle of the object with its methods +keep_start+,
# +keep_end+, +keep_back+, +keep_drop+.
# CSV::InputsScanner.scan() tries to match with pattern at the current position.
# If there's a match, the scanner advances the “scan pointer” and returns the matched string.
# Otherwise, the scanner returns nil.
# CSV::InputsScanner.rest() returns the “rest” of the string (i.e. everything after the scan pointer).
# If there is no more data (eos? = true), it returns "".
def initialize(inputs, encoding, chunk_size: 8192)
# Takes the inputs, their encoding and an optional +chunk_size+ (8192 when omitted).
# +@last_scanner+ is true once the list of inputs is empty.
@last_scanner = @inputs.empty?
# Iterates over the input in pieces terminated by +row_separator+ while
# keeping byte positions up to date.
def each_line(row_separator)
# Separator length in characters; the two-character case is special-cased below.
n_row_separator_chars = row_separator.size
input.each_line(row_separator) do |line|
# Advance the scan pointer by the byte length of this line.
@scanner.pos += line.bytesize
# A two-character separator can be split across reads: its first character
# ends +buffer+ and its second character begins +line+.
if n_row_separator_chars == 2 and
buffer.end_with?(row_separator[0]) and
line.start_with?(row_separator[1])
position += buffer.bytesize + offset
# The line is complete when it ends with the full separator.
if line.end_with?(row_separator)
position += line.bytesize + offset
# +offset+ becomes the negated byte size of +buffer+ (only when a buffer exists),
# which cancels the buffer's length in the position sums above.
offset = -buffer.bytesize if buffer
# Try +pattern+ at the current scan position.
value = @scanner.scan(pattern)
# +@last_scanner+ means no inputs remain, so this result is final.
return value if @last_scanner
# The current chunk is used up: load the next one before going on.
read_chunk if @scanner.eos?
value = @scanner.scan(pattern)
return value if @last_scanner
# Keep going while the chunk is exhausted, +read_chunk+ reports success and
# +pattern+ matches again in the freshly read chunk.
while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern))
# Remember the current scan position; the second slot (a buffer) starts as nil.
@keeps.push([@scanner.pos, nil])
# Take the most recent entry back off the keep stack.
start, buffer = @keeps.pop
# Bytes scanned between +start+ and the current position.
keep = @scanner.string.byteslice(start, @scanner.pos - start)
start, buffer = @keeps.pop
# Everything from +start+ to the end of +string+.
keep = string.byteslice(start, string.bytesize - start)
# Non-empty kept text is wrapped in a StringIO and placed at the front of
# +@inputs+, so it becomes the next input to be read.
if keep and not keep.empty?
@inputs.unshift(StringIO.new(keep))
# Continue scanning from +buffer+.
@scanner = StringScanner.new(buffer)
read_chunk if @scanner.eos?
# Reports failure (false) once the last scanner has been reached.
return false if @last_scanner
# Bytes between +keep_start+ and the current scan position.
keep_data = string.byteslice(keep_start, @scanner.pos - keep_start)
# Data that is not valid in its encoding is refused.
raise InvalidEncoding unless string.valid_encoding?
@scanner = StringScanner.new(string)
@last_scanner = @inputs.empty?
# nil separator plus a limit: read by size rather than by line.
# NOTE(review): assumes +input+ is IO-like and the limit is in bytes — confirm.
chunk = input.gets(nil, @chunk_size)
raise InvalidEncoding unless chunk.valid_encoding?
@scanner = StringScanner.new(chunk)
# Only inputs that implement +eof?+ are asked whether they are exhausted.
if input.respond_to?(:eof?) and input.eof?
@last_scanner = @inputs.empty?
# Scanner over an empty String in the target encoding.
@scanner = StringScanner.new("".encode(@encoding))
@last_scanner = @inputs.empty?
# Receives the input to parse and the option set.
def initialize(input, options)
# Holds when headers are enabled but none have been set yet.
# NOTE(review): the method enclosing this expression is not visible here.
@use_headers and @headers.nil?
# Without a block, hand back an Enumerator over this same method.
return to_enum(__method__) unless block_given?
# A header row is built only when requested and both header forms are present.
if @return_headers and @headers and @raw_headers
# NOTE(review): the third argument is presumably Row's header-row flag — confirm.
headers = Row.new(@headers, @raw_headers, true)
headers = add_unconverted_fields(headers, [])
# Build the scanner on first use and reuse it afterwards.
@scanner ||= build_scanner
# +@need_robust_parsing+ selects the robust routine; the loose routine is the
# alternative (the branch keyword between them is not visible here).
elsif @need_robust_parsing
parse_quotable_robust(&block)
parse_quotable_loose(&block)
# Reported as a MalformedCSVError carrying the current line number.
message = "Invalid byte sequence in #{@encoding}"
raise MalformedCSVError.new(message, lineno)
# A set of tasks to prepare the file in order to parse it
@need_robust_parsing = false
# Robust parsing starts switched off; assignments further down switch it on.
@encoding = @options[:encoding]
liberal_parsing = @options[:liberal_parsing]
# The Hash form of :liberal_parsing carries individual sub-options.
if liberal_parsing.is_a?(Hash)
@double_quote_outside_quote =
liberal_parsing[:double_quote_outside_quote]
@backslash_quote = liberal_parsing[:backslash_quote]
# NOTE(review): the branch these two assignments belong to is not visible here.
@double_quote_outside_quote = false
@need_robust_parsing = true
# The remaining options are copied onto instance variables unchanged.
@unconverted_fields = @options[:unconverted_fields]
@field_size_limit = @options[:field_size_limit]
@skip_blanks = @options[:skip_blanks]
@fields_converter = @options[:fields_converter]
@header_fields_converter = @options[:header_fields_converter]
# Derives the quote-related strings and regexps from the :quote_character option.
def prepare_quote_character
@quote_character = @options[:quote_character]
@escaped_quote_character = nil
# Normalise to a String in the parser's encoding.
@quote_character = @quote_character.to_s.encode(@encoding)
# Anything other than exactly one character is rejected with ArgumentError.
if @quote_character.length != 1
message = ":quote_char has to be nil or a single character String"
raise ArgumentError, message
# Two quote characters in a row.
@double_quote_character = @quote_character * 2
# Regexp-safe text of the quote character and the Regexp compiled from it.
@escaped_quote_character = Regexp.escape(@quote_character)
@escaped_quote = Regexp.new(@escaped_quote_character)
# Backslash handling is set up only when +@backslash_quote+ is enabled.
return unless @backslash_quote
# Same three-step derivation as for the quote: raw, regexp-escaped, compiled.
@backslash_character = "\\".encode(@encoding)
@escaped_backslash_character = Regexp.escape(@backslash_character)
@escaped_backslash = Regexp.new(@escaped_backslash_character)
# Starts as nil; the assignment below combines the backslash with the
# regexp-escaped quote character.
@backslash_quote_character = nil
@backslash_quote_character =
@backslash_character + @escaped_quote_character
# Reads the :skip_lines option.
skip_lines = @options[:skip_lines]
# Stored re-encoded into the parser's encoding.
# NOTE(review): presumably only applies when the option is a String — confirm.
@skip_lines = skip_lines.encode(@encoding)
# The value has to respond to #match; otherwise ArgumentError is raised.
unless skip_lines.respond_to?(:match)
":skip_lines has to respond to \#match: #{skip_lines.inspect}"
raise ArgumentError, message
@strip = @options[:strip]
# Per the messages below, an empty String and a String of two or more
# characters are both rejected (the guarding conditions are not visible here).
raise ArgumentError, ":strip must not be an empty String"
raise ArgumentError, ":strip doesn't support 2 or more characters yet"
# Keep the strip character in the parser's encoding, plus a regexp-escaped copy.
@strip = @strip.encode(@encoding)
@escaped_strip = Regexp.escape(@strip)
@strip_value = Regexp.new(@escaped_strip +
@need_robust_parsing = true
@escaped_strip = strip_values.encode(@encoding)
# Character class matching one or more of the characters in +strip_values+.
@strip_value = Regexp.new("[#{strip_values}]+".encode(@encoding))
# Stripping requires the robust parsing routine.
@need_robust_parsing = true
# Probe call: StringScanner#scan given a String (rather than a Regexp) pattern.
StringScanner.new("x").scan("x")
# Class variable recording whether String patterns are accepted by +scan+.
# NOTE(review): which probe outcome selects which value is not visible here — confirm.
@@string_scanner_scan_accept_string = false
@@string_scanner_scan_accept_string = true
# Reads the :column_separator option.
column_separator = @options[:column_separator]
# Normalise to a String in the parser's encoding.
@column_separator = column_separator.to_s.encode(@encoding)
# An empty column separator is rejected; the message includes the raw option value.
if @column_separator.size < 1
message = ":col_sep must be 1 or more characters: "
message += column_separator.inspect
raise ArgumentError, message
resolve_row_separator(@options[:row_separator]).encode(@encoding)
# Regexp-safe forms of the whole column separator and of its first character.
@escaped_column_separator = Regexp.escape(@column_separator)
@escaped_first_column_separator = Regexp.escape(@column_separator[0])
# Multi-character separators get a Regexp for the whole separator and one
# Regexp per individual character.
if @column_separator.size > 1
@column_end = Regexp.new(@escaped_column_separator)
@column_ends = @column_separator.each_char.collect do |char|
Regexp.new(Regexp.escape(char))