chunk - break a large XML file into manageable chunks


/ Published in: Ruby
Save to your folder(s)

I’ve recently had to parse some pretty large XML documents, and needed a method to read one element at a time.

Here’s a fairly simple solution in Ruby form.


Copy this code and paste it in your HTML
  #
  # Chunk
  #
  # Reads a large file in pieces and hands back one whole XML element per call,
  # so a big document can be processed without loading it all into memory.
  #
  # Each call to #read returns the next occurrence of the configured element,
  # including its start and end tags, or false once no further complete
  # element can be found.
  #
  # Limitations: the start tag is matched literally as "<element>", so start
  # tags carrying attributes and self-closing tags are not recognised, and an
  # element that nests another element of the same name is cut at the first
  # closing tag.
  #
  # Usage:
  #
  #   # initialize the object
  #   file = Chunk.new('chunk-test.xml', { 'element' => 'Chunk' })
  #
  #   # loop through the file until all elements are read
  #   while (xml = file.read)
  #     # do whatever you want with the string
  #     puts xml
  #   end
  #
  #   file.close
  #
  class Chunk
    # Raised when the requested file cannot be loaded.
    class Error < StandardError; end

    # Option values used when the caller does not supply their own.
    DEFAULT_OPTIONS = {
      'path' => './',
      'element' => '',
      'chunkSize' => 512
    }.freeze

    # Smallest accepted chunkSize; anything lower falls back to the default.
    MIN_CHUNK_SIZE = 64

    # object_id => open File. The finalizer receives only the object id, so the
    # handle is looked up here instead of being captured through the instance
    # (which would keep the instance from ever being collected).
    OPEN_HANDLES = {}
    private_constant :OPEN_HANDLES

    # Closes the file handle registered for the given object id, if it is
    # still open. Installed as the GC finalizer for every instance; calling it
    # again, or with an unknown id, does nothing.
    #
    # @param id [Integer] object_id of the Chunk instance
    # @return [void]
    def self.finalize(id)
      handle = OPEN_HANDLES.delete(id)
      handle.close if handle && !handle.closed?
    end

    # Builds the Chunk object and opens the file for reading.
    #
    # Only the basename of +file+ is used; the directory always comes from the
    # 'path' option.
    #
    # @param file [String] the filename to work with
    # @param options [Hash] 'path' (directory containing the file, default
    #   './'), 'element' (tag name to extract; blank means "return the whole
    #   document") and 'chunkSize' (bytes read per step, minimum 64, default
    #   512). Keys may be strings or symbols. A non-Hash value is ignored.
    # @raise [Chunk::Error] if the file does not exist or is not a regular file
    def initialize(file, options = {})
      # every instance gets its own copy so instances never affect each other
      @options = DEFAULT_OPTIONS.dup
      options.each { |key, value| @options[key.to_s] = value } if options.is_a?(Hash)

      # make sure the path ends with a /
      path = @options['path'].to_s
      path += '/' unless path.end_with?('/')
      @options['path'] = path

      # make sure chunkSize is a usable integer
      size = @options['chunkSize'].to_i
      @options['chunkSize'] = size >= MIN_CHUNK_SIZE ? size : DEFAULT_OPTIONS['chunkSize']

      @file = File.expand_path(path + File.basename(file))
      raise Error, "Cannot load file: #{@file}" unless File.file?(@file)

      # byte offset at which the next call to #read starts looking
      @pointer = 0
      @handle = File.new(@file, 'r')

      OPEN_HANDLES[object_id] = @handle
      ObjectSpace.define_finalizer(self, self.class.method(:finalize).to_proc)
    end

    # Closes the underlying file. Safe to call more than once.
    #
    # @return [void]
    def close
      self.class.finalize(object_id)
    end

    # Reads the next available occurrence of the XML element named by the
    # 'element' option. When that option is blank, the first call returns the
    # whole document instead.
    #
    # @return [String, false] the element including its tags (or the whole
    #   document), or false when there is nothing (more) to return
    def read
      element = @options['element'].to_s.strip
      element.empty? ? read_document : read_element(element)
    end

    private

    # Returns everything from the current position to the end of the file.
    def read_document
      @handle.seek(@pointer)
      document = @handle.read
      @pointer = @handle.pos
      document.empty? ? false : document
    end

    # Scans forward from the current position for "<element>...</element>" and
    # returns it. The position only moves when a complete element is returned.
    def read_element(element)
      # IO#read(length) yields binary strings, so search with binary tags
      open_tag = "<#{element}>".b
      close_tag = "</#{element}>".b

      @handle.seek(@pointer)
      window = String.new      # unconsumed bytes, binary encoded
      window_start = @pointer  # absolute file offset of window's first byte
      inside = false           # true once the start tag has been seen
      scan_from = 0            # where in window to resume the end-tag search

      while (chunk = @handle.read(@options['chunkSize']))
        window << chunk

        unless inside
          opened_at = window.index(open_tag)
          # Drop everything before the start tag. If it has not been seen yet,
          # keep just enough of the tail to catch a tag that is split across
          # two chunks.
          discard = opened_at || [window.bytesize - open_tag.bytesize + 1, 0].max
          window = window.byteslice(discard, window.bytesize - discard)
          window_start += discard
          next unless opened_at

          inside = true
          scan_from = open_tag.bytesize
        end

        closed_at = window.index(close_tag, scan_from)
        if closed_at
          length = closed_at + close_tag.bytesize
          @pointer = window_start + length
          # tag the bytes with the file's encoding, as a plain text read would
          encoding = @handle.external_encoding || Encoding.default_external
          return window.byteslice(0, length).force_encoding(encoding)
        end

        # the next search must cover an end tag that straddles the boundary
        scan_from = [window.bytesize - close_tag.bytesize + 1, open_tag.bytesize].max
      end

      # reached the end of the file without a complete element
      false
    end
  end

URL: http://www.dom111.co.uk/blog/coding/chunk-read-a-large-xml-file-a-chunk-at-a-time/99

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.