python - メモリやディスクに保存せずに、大きなアーカイブを作成してストリーミングします

Question

ユーザーが複数の大きなファイルのアーカイブを一度にダウンロードできるようにしたい。ただし、ファイルとアーカイブが大きすぎて、メモリまたはサーバーのディスクに保存できない場合があります（他のサーバーからオンザフライでストリーミングされます）。ユーザーにストリーミングするときにアーカイブを生成したいと思います。

TarやZipなどの最も単純なものを使用できます。私はdjangoを使用しています。これにより、応答でジェネレーターまたはファイルのようなオブジェクトを返すことができます。このオブジェクトは、プロセスを推進するために使用できます。ただし、zipfileまたはtarfileライブラリを中心にこの種のものを構築する方法を理解するのに苦労しており、ファイルの読み取りや、構築時のアーカイブの読み取りをサポートしていない可能性があります。

イテレータをファイルのようなオブジェクトに変換する方法についてのこの回答が役立つかもしれません。tarfile#addfileは反復可能オブジェクトを受け取りますが、すぐにそれをshutil.copyfileobjに渡しているように見えるので、私が期待していたほどジェネレーターに適していないかもしれません。

score 9 · Accepted Answer

9

結局、 SpiderOakZipStreamを使用しました。

2012-10-10T01:42:52.173 に回答

score 7 · Accepted Answer

これを行うには、圧縮せずにzipファイルを生成してストリーミングします。これは、基本的に、各ファイルのコンテンツの前にヘッダーを追加するだけです。そうです、ライブラリはこれをサポートしていませんが、ライブラリをハックして機能させることができます。

このコードは、ストリームを管理し、ファイルが到着したときにzipfile.ZipInfoのインスタンスを作成するクラスでzipfile.ZipFileをラップします。CRCとサイズは最後に設定できます。put_file（）、write（）、flush（）を使用して入力ストリームからデータをプッシュし、read（）を使用してデータを出力ストリームに読み取ることができます。

import struct      
import zipfile
import time

from StringIO import StringIO

class ZipStreamer(object):
    """Build an uncompressed (stored) zip archive incrementally.

    Entries are pushed in with ``put_file()`` / ``write()`` / ``flush()`` and
    the archive bytes produced so far are pulled out with ``read()``.  Every
    entry is written with general-purpose flag bit 3 (0x08) set, so its CRC
    and sizes follow the data in a data descriptor instead of the header.

    NOTE: this relies on private ``zipfile`` details (``_writecheck``,
    ``filelist``, ``NameToInfo``, ``start_dir``).  The internal buffer is
    never trimmed, so everything written stays in memory.  The data
    descriptor uses 32-bit fields, so one entry cannot reach 4 GiB.
    """

    def __init__(self):
        # Local import keeps this block self-contained; io.BytesIO stores
        # bytes on both Python 2.6+ and Python 3.
        import io

        self.out_stream = io.BytesIO()

        # write to the buffer with no compression
        self.zipfile = zipfile.ZipFile(self.out_stream, 'w', zipfile.ZIP_STORED)

        # ZipInfo of the entry currently being written, or None
        self.current_file = None

        # buffer offset up to which read() has already returned data
        self._last_streamed = 0

    def put_file(self, name, date_time=None):
        """Start a new archive entry.

        Args:
            name: archive name of the entry.
            date_time: ``(year, month, day, hour, minute, second)`` tuple;
                defaults to the current local time.
        """
        if self.current_file is not None:
            # Finish the previous entry; otherwise it would never reach the
            # central directory and its data descriptor would be missing.
            self.flush()

        if date_time is None:
            date_time = time.localtime(time.time())[:6]

        zinfo = zipfile.ZipInfo(name, date_time)
        zinfo.compress_type = zipfile.ZIP_STORED
        zinfo.flag_bits = 0x08  # CRC and sizes come after the data
        zinfo.external_attr = 0o600 << 16  # rw------- in the upper 16 bits
        zinfo.header_offset = self.out_stream.tell()

        # placeholders; the real values are accumulated by write()
        zinfo.CRC = 0
        zinfo.file_size = 0
        zinfo.compress_size = 0

        self.zipfile._writecheck(zinfo)

        # write header to stream
        self.out_stream.write(zinfo.FileHeader())

        self.current_file = zinfo

    def flush(self):
        """Finish the current entry by writing its data descriptor.

        Does nothing when no entry is open.
        """
        zinfo = self.current_file
        if zinfo is None:
            return
        self.out_stream.write(
            struct.pack("<LLL", zinfo.CRC, zinfo.compress_size, zinfo.file_size))
        # Register the entry so ZipFile.close() lists it in the central directory.
        self.zipfile.filelist.append(zinfo)
        self.zipfile.NameToInfo[zinfo.filename] = zinfo
        self.current_file = None

    def write(self, bytes):
        """Append *bytes* to the data of the current entry.

        Raises:
            ValueError: if no entry has been started with ``put_file()``.
        """
        zinfo = self.current_file
        if zinfo is None:
            # Check first so stray bytes never end up inside the archive.
            raise ValueError("put_file() must be called before write()")
        self.out_stream.write(bytes)
        # update the running totals that flush() will emit
        zinfo.CRC = zipfile.crc32(bytes, zinfo.CRC) & 0xffffffff
        zinfo.file_size += len(bytes)
        zinfo.compress_size += len(bytes)

    def read(self):
        """Return the archive bytes written since the previous ``read()``."""
        end = self.out_stream.tell()

        self.out_stream.seek(self._last_streamed)
        data = self.out_stream.read()

        # restore the write position at the end of the buffer
        self.out_stream.seek(end)
        self._last_streamed = end

        return data

    def close(self):
        """Write the central directory; fetch it with a final ``read()``."""
        if self.current_file is not None:
            self.flush()
        # Python 3's ZipFile.close() seeks to start_dir before writing the
        # central directory.  The entries were written behind ZipFile's back,
        # so point start_dir at the real end of the entry data.
        self.zipfile.start_dir = self.out_stream.tell()
        self.zipfile.close()

このコードは概念実証にすぎず、httpサーバー自体にこの問題を処理させることにした後は、それ以上の開発やテストは行わなかったことを覚えておいてください。これを使用する場合に検討する必要があるいくつかのことは、ネストされたフォルダーが正しくアーカイブされているかどうか、およびファイル名のエンコード（とにかくzipファイルでは常に面倒です）を確認することです。

score 7 · Accepted Answer

tell()を実装するファイルのようなオブジェクトでfileobjをラップすることにより、ZipFileをPylonsまたはDjangoの応答fileobjにストリーミングできます。これにより、zip内の個々のファイルはメモリにバッファリングされますが、zip自体はストリーミングされます。私たちはこれを使用して、画像でいっぱいのzipファイルをストリーミングダウンロードしているため、メモリに複数の画像が同時にバッファリングされることはありません。

この例はsys.stdoutにストリーミングします。Pylonsの場合はresponse.body_fileを、Djangoの場合はHttpResponseそれ自体をファイルとして使用できます。

import zipfile
import sys


class StreamFile(object):
    """Write-only wrapper that adds ``tell()`` to a stream lacking it.

    The wrapper counts what passes through ``write()`` and reports that
    count from ``tell()``, forwarding the data itself to the wrapped object.
    """

    def __init__(self, fileobj):
        """Wrap *fileobj*, starting the position counter at zero."""
        self.pos = 0
        self.fileobj = fileobj

    def write(self, str):
        """Forward *str* to the wrapped object and advance the position."""
        target = self.fileobj
        target.write(str)
        self.pos = self.pos + len(str)

    def tell(self):
        """Return the total length written through this wrapper."""
        return self.pos

    def flush(self):
        """Flush the wrapped object."""
        target = self.fileobj
        target.flush()


# Wrap a stream so ZipFile can use it.  ZipFile emits bytes, so on Python 3
# write to the binary buffer behind sys.stdout; where sys.stdout has no
# ``buffer`` attribute (Python 2) it is used directly.
out = StreamFile(getattr(sys.stdout, 'buffer', sys.stdout))

# The with-block closes the archive, which writes the central directory,
# even if adding an entry fails part-way.
with zipfile.ZipFile(out, 'w', zipfile.ZIP_DEFLATED) as z:
    for i in range(5):
        z.writestr("hello{0}.txt".format(i), "this is hello{0} contents\n".format(i) * 3)

score 3 · Accepted Answer

これはPedro Werneckによる（上記の）解決策ですが、すべてのデータがメモリに蓄積されるのを避けるための修正を加えています（readメソッドを少し修正しています）：

class _StreamBuffer(object):
    """Write-only byte buffer that can be drained but keeps absolute offsets."""

    def __init__(self):
        self._chunks = []
        self._position = 0  # total number of bytes ever written

    def write(self, data):
        self._chunks.append(data)
        self._position += len(data)
        return len(data)

    def tell(self):
        # Absolute archive offset, unaffected by drain().
        return self._position

    def flush(self):
        pass

    def drain(self):
        """Return everything buffered so far and forget it."""
        data = b"".join(self._chunks)
        self._chunks = []
        return data


class ZipStreamer(object):
    """Build an uncompressed (stored) zip archive incrementally.

    Entries are pushed in with ``put_file()`` / ``write()`` / ``flush()``.
    ``read()`` hands back the archive bytes produced since the previous call
    and releases them, so only undrained data stays in memory.  Every entry
    is written with general-purpose flag bit 3 (0x08) set, so its CRC and
    sizes follow the data in a data descriptor instead of the header.

    NOTE: this relies on private ``zipfile`` details (``_writecheck``,
    ``filelist``, ``NameToInfo``, ``start_dir``).  The data descriptor uses
    32-bit fields, so one entry cannot reach 4 GiB.
    """

    def __init__(self):
        # The buffer reports absolute offsets from tell() even after being
        # drained, so header and central-directory offsets stay correct.
        self.out_stream = _StreamBuffer()

        # write to the buffer with no compression
        self.zipfile = zipfile.ZipFile(self.out_stream, 'w', zipfile.ZIP_STORED)

        # ZipInfo of the entry currently being written, or None
        self.current_file = None

    def put_file(self, name, date_time=None):
        """Start a new archive entry.

        Args:
            name: archive name of the entry.
            date_time: ``(year, month, day, hour, minute, second)`` tuple;
                defaults to the current local time.
        """
        if self.current_file is not None:
            # Finish the previous entry; otherwise it would never reach the
            # central directory and its data descriptor would be missing.
            self.flush()

        if date_time is None:
            date_time = time.localtime(time.time())[:6]

        zinfo = zipfile.ZipInfo(name, date_time)
        zinfo.compress_type = zipfile.ZIP_STORED
        zinfo.flag_bits = 0x08  # CRC and sizes come after the data
        zinfo.external_attr = 0o600 << 16  # rw------- in the upper 16 bits
        zinfo.header_offset = self.out_stream.tell()

        # placeholders; the real values are accumulated by write()
        zinfo.CRC = 0
        zinfo.file_size = 0
        zinfo.compress_size = 0

        self.zipfile._writecheck(zinfo)

        # write header to the stream
        self.out_stream.write(zinfo.FileHeader())

        self.current_file = zinfo

    def flush(self):
        """Finish the current entry by writing its data descriptor.

        Does nothing when no entry is open.
        """
        zinfo = self.current_file
        if zinfo is None:
            return
        self.out_stream.write(
            struct.pack("<LLL", zinfo.CRC, zinfo.compress_size,
                        zinfo.file_size))
        # Register the entry so ZipFile.close() lists it in the central directory.
        self.zipfile.filelist.append(zinfo)
        self.zipfile.NameToInfo[zinfo.filename] = zinfo
        self.current_file = None

    def write(self, bytes):
        """Append *bytes* to the data of the current entry.

        Raises:
            ValueError: if no entry has been started with ``put_file()``.
        """
        zinfo = self.current_file
        if zinfo is None:
            # Check first so stray bytes never end up inside the archive.
            raise ValueError("put_file() must be called before write()")
        self.out_stream.write(bytes)
        # update the running totals that flush() will emit
        zinfo.CRC = zipfile.crc32(bytes, zinfo.CRC) & 0xffffffff
        zinfo.file_size += len(bytes)
        zinfo.compress_size += len(bytes)

    def read(self):
        """Return and release the bytes written since the previous call."""
        return self.out_stream.drain()

    def close(self):
        """Write the central directory; fetch it with a final ``read()``."""
        if self.current_file is not None:
            self.flush()
        # The entries were written behind ZipFile's back, so tell it where
        # the central directory starts; Python 3 records this offset in the
        # end-of-archive record.
        self.zipfile.start_dir = self.out_stream.tell()
        self.zipfile.close()

次に、stream_generator関数をzipファイルのストリームとして使用できます。

def stream_generator(files_paths, chunk_size=64 * 1024):
    """Yield the bytes of a zip archive containing the given files.

    Args:
        files_paths: iterable of filesystem paths; each path is also used
            as the entry name inside the archive.
        chunk_size: maximum number of bytes read from a file at a time.

    Yields:
        Consecutive pieces of the archive.  NOTE(review): this assumes
        ``ZipStreamer.read()`` returns only the data produced since the
        previous call — confirm against the ZipStreamer in use.
    """
    s = ZipStreamer()
    for path in files_paths:
        s.put_file(path)
        # Binary mode: archive content must be raw bytes, without newline
        # translation or text decoding.
        with open(path, 'rb') as source:
            # Read piecewise so a large file is never held in memory whole.
            for chunk in iter(lambda: source.read(chunk_size), b''):
                s.write(chunk)
                yield s.read()
        s.flush()
        yield s.read()
    s.close()
    # close() writes the central directory; without this final yield the
    # client would receive a truncated archive.
    yield s.read()

Falconの例：

class StreamZipEndpoint(object):
    """Resource whose GET handler answers with a streamed zip attachment."""

    def on_get(self, req, resp):
        """Set zip download headers on *resp* and attach the archive stream.

        *req* is not used.
        """
        zip_filename = 'output_filename.zip'
        source_paths = ['/path/to/file/1', '/path/to/file/2']
        disposition = 'attachment; filename="%s"' % (zip_filename,)

        resp.content_type = 'application/zip'
        resp.set_headers([('Content-Disposition', disposition)])

        # The generator is handed over unconsumed.
        resp.stream = stream_generator(source_paths)

score 0 · Accepted Answer

1つの選択肢はstream-zipを使用することです（情報開示：私が作者です）。

その例を少し修正します。

from datetime import datetime
from stream_zip import stream_zip, ZIP_64

def non_zipped_files():
    """Yield one tuple per file to be placed in the archive.

    Each tuple is ``(name, modified_at, perms, method, data_chunks)``, where
    ``data_chunks`` is an iterable of bytes.  NOTE(review): this is assumed
    to be the member format ``stream_zip`` expects — confirm against the
    installed stream-zip version.
    """
    modified_at = datetime.now()
    perms = 0o600

    # Hard coded in this example, but in real cases could
    # for example yield data from a remote source
    def file_1_data():
        for _ in range(0, 1000):
            yield b'Some bytes'

    def file_2_data():
        for _ in range(0, 1000):
            yield b'Some bytes'

    # ZIP_64 is the name imported from stream_zip; the previous spelling
    # ``ZIP64`` was never defined and raised NameError.
    yield 'my-file-1.txt', modified_at, perms, ZIP_64, file_1_data()
    yield 'my-file-2.txt', modified_at, perms, ZIP_64, file_2_data()

# Feed the member generator to stream_zip.  Presumably the result is a lazy
# iterable of archive byte chunks — confirm against the stream-zip docs.
zipped_chunks = stream_zip(non_zipped_files())

# Can print each chunk, or return them to a client,
# say using Django's StreamingHttpResponse
for zipped_chunk in zipped_chunks:
    print(zipped_chunk)

python - メモリやディスクに保存せずに、大きなアーカイブを作成してストリーミングします

5 に答える 5

Related

Reference