Following on from my previous TTS post using Chatterbox Turbo (or any other TTS like Kokoro or Qwen3-TTS), here is code to combine audio files into a single .m4b audiobook, retaining author and title metadata, chapter markers, and a cover image. Mostly this is a Python wrapper around FFmpeg.

Again, disclaimers: don’t download and run. Code may not be safe.

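The prerequisites are an up-to-date version of Python (3.10 or later, since the script uses match), ffmpeg along with ffprobe, and the epub2text Python package that the script imports. One method is via Homebrew, brew install python ffmpeg.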

  • title and author information is read from the .epub but ffprobe is used to get the durations of each of the individual audio files, in order to create chapter breaks.
  • ffmpeg combines individual audio files into a .m4b, converting to aac in the process.
  • the Python zipfile library extracts the first image from the .epub and then,
  • ffmpeg is used again to add this image, assumed to be the book cover, to the metadata.
# audiobook.py - Make .m4b audiobook from individual .m4a/.mp3 TTS files
# (c) C.Y., mybyways.com
#  v0.1 15 Feb 26

import glob
import os
import shlex
import subprocess
import sys
import time
import zipfile

from epub2text import EPUBParser

# Names of the intermediate files written inside the audio directory:
# the FFMETADATA file (title, artist, chapter markers) and the concat list
# (one "file '...'" line per audio file).
METADATA_FILE = 'metadata.txt'
CHAPTERS_FILE = 'chapters.txt'
# ffprobe invocation that prints only the duration of a file; the caller
# appends the file path.
# NOTE(review): both "-v error" and "-loglevel quiet" are given - presumably
# only one of them is needed; confirm.
FFPROBE_CMD = 'ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 -loglevel quiet'
# Concatenates the files listed in {chapters_file} into {temp_file}, taking
# the metadata from the second input ({metadata_file}, input index 1).
# NOTE(review): "-c:a aac -vn" sits before the second -i, so ffmpeg may treat
# these as input options for the metadata file rather than as output options -
# confirm against the ffmpeg documentation.
FFMPEG_CONCAT_CMD = 'ffmpeg -y -f concat -safe 0 -i "{chapters_file}" -c:a aac -vn -i "{metadata_file}" -map_metadata 1 "{temp_file}"'
# Stream-copies {temp_file} plus {cover_file} into {audiobook_file}, marking
# the image as an attached picture.
# NOTE(review): unlike the concat command there is no -y here, so ffmpeg will
# presumably ask before overwriting an existing {audiobook_file} - confirm.
FFMPEG_COVER_CMD = 'ffmpeg -i "{temp_file}" -i "{cover_file}" -c copy -disposition:v attached_pic "{audiobook_file}"'
# Extension of the per-chapter audio files looked up in the audio directory.
INPUT_FORMAT = '.flac'

#https://stackoverflow.com/questions/6405208/how-to-convert-numeric-string-ranges-to-a-list-in-python
def stringrange_to_list(text):
    """Expand a comma-separated list of numbers and ranges into a list of ints.

    A token without '-' contributes a single int. A token with '-' is split on
    '-' and handed to range() after adding each part's position to it, so
    'a-b' becomes range(a, b + 1), i.e. the end is inclusive.
    Example: '1,4-6' -> [1, 4, 5, 6].
    """
    numbers = []
    for token in text.split(','):
        if '-' not in token:
            numbers.append(int(token))
            continue
        # Adding the position (0, 1, ...) to each part turns "start-end" into
        # the half-open arguments range() expects.
        bounds = [int(part) + offset for offset, part in enumerate(token.split('-'))]
        numbers.extend(range(*bounds))
    return numbers

def start_timer():
    """Return the current time.perf_counter() reading, for use with stop_timer()."""
    tic = time.perf_counter()
    return tic

def stop_timer(tic):
    """Return the time elapsed since `tic` as (seconds, whole minutes, whole seconds).

    `tic` is a reading previously obtained from time.perf_counter(). The first
    element is the total elapsed time in seconds as a float; the other two are
    that same duration split into integer minutes and remaining integer seconds.
    """
    elapsed = time.perf_counter() - tic
    minutes, seconds = divmod(elapsed, 60)
    return elapsed, int(minutes), int(seconds)

def list_epub_chapters(epub_file):
    """Print the book's title, author(s) and a numbered list of its chapters.

    Each chapter line shows its 1-based number, its title and its character
    count as reported by the parser.
    """
    book = EPUBParser(epub_file)
    info = book.get_metadata()
    authors = ", ".join(info.authors)
    print(f'Book: {epub_file}\nTitle: {info.title}\nAuthor(s): {authors}')
    for number, chapter in enumerate(book.get_chapters(), start=1):
        print(f' {number:>3}. {chapter.title}: {chapter.char_count:,} characters')

def _escape_ffmetadata(value):
    """Backslash-escape the characters that are special in an FFMETADATA file."""
    # A single translate() pass, so an inserted backslash is never escaped twice.
    return str(value).translate(str.maketrans({
        '\\': '\\\\',
        '=': '\\=',
        ';': '\\;',
        '#': '\\#',
        '\n': '\\\n',
    }))


def save_metadata(epub_file, audio_dir, chapter_range):
    """Write the FFMETADATA file and the concat list for the selected chapters.

    For every chapter number in `chapter_range` whose audio file
    (`<number, zero-padded to 3 digits><INPUT_FORMAT>`) exists in `audio_dir`,
    the file is added to the concat list and a [CHAPTER] entry is written whose
    START/END (in milliseconds) are accumulated from the durations reported by
    ffprobe. Chapter numbers without an audio file are skipped.

    Args:
        epub_file: path of the .epub providing title, authors and chapter titles.
        audio_dir: directory holding the audio files; both output files are
            written here.
        chapter_range: sequence of 1-based chapter numbers.

    Returns:
        Tuple (metadata_file, chapters_file) with the paths of the written files.

    Raises:
        ValueError: if ffprobe does not report a usable duration for a file.
    """
    epub = EPUBParser(epub_file)
    metadata = epub.get_metadata()
    chapters = epub.get_chapters()
    metadata_file = os.path.join(audio_dir, METADATA_FILE)
    chapters_file = os.path.join(audio_dir, CHAPTERS_FILE)
    # Argument list instead of a shell string, so paths containing quotes or
    # shell metacharacters are passed to ffprobe verbatim.
    probe_cmd = shlex.split(FFPROBE_CMD)
    # Explicit UTF-8 so non-ASCII titles do not depend on the platform default.
    with open(metadata_file, 'w', encoding='utf-8') as meta, \
            open(chapters_file, 'w', encoding='utf-8') as chap:
        meta.write(';FFMETADATA1\n'
                   f'title={_escape_ffmetadata(metadata.title)}\n'
                   f'artist={_escape_ffmetadata(", ".join(metadata.authors))}\n')
        start = 0
        for i, c in enumerate(chapter_range):
            audio_name = f'{c:03}{INPUT_FORMAT}'
            audio_file = os.path.join(audio_dir, audio_name)
            if not os.path.isfile(audio_file):
                continue
            # The concat list sits in audio_dir, so the bare file name is enough.
            chap.write(f"file '{audio_name}'\n")
            result = subprocess.run(probe_cmd + [audio_file],
                                    stdout=subprocess.PIPE, text=True)
            try:
                duration = float(result.stdout)
            except ValueError as err:
                raise ValueError(
                    f'ffprobe did not report a duration for {audio_file}') from err
            end = start + int(duration * 1000)
            # i + 1 is the position within chapter_range; c is the 1-based
            # chapter number used to look up the title.
            title = _escape_ffmetadata(f'{i+1}. {chapters[c-1].title}')
            meta.write('[CHAPTER]\n'
                       'TIMEBASE=1/1000\n'
                       f'START={start}\n'
                       f'END={end}\n'
                       f'title={title}\n')
            start = end
    return metadata_file, chapters_file

def merge_audiobook(audio_dir, metadata_file, chapters_file):
    """Run the ffmpeg concat command to build `output.m4b` inside `audio_dir`.

    Args:
        audio_dir: directory in which the merged file is created.
        metadata_file: path of the FFMETADATA file.
        chapters_file: path of the concat list.

    Returns:
        The path of the merged file if ffmpeg exited with status 0, else None.
    """
    temp_file = os.path.join(audio_dir, 'output.m4b')
    paths = {
        'metadata_file': metadata_file,
        'chapters_file': chapters_file,
        'temp_file': temp_file,
    }
    # Split the template first and substitute afterwards: each path stays one
    # argument and never passes through a shell, so quotes, spaces or "$" in a
    # path cannot break or alter the command.
    command = [arg.format(**paths) for arg in shlex.split(FFMPEG_CONCAT_CMD)]
    result = subprocess.run(command, stdout=subprocess.PIPE)
    return temp_file if result.returncode == 0 else None

# Save first jpg/png image as cover and add to .m4b; the .m4b is named after
# the .epub and written relative to the current working directory.
def save_cover(epub_file, audio_dir, temp_file):
    """Attach the first image found in the .epub to the merged audio as cover art.

    The first archive member whose name ends in .jpg, .jpeg or .png is
    extracted into `audio_dir` and passed to the ffmpeg cover command together
    with `temp_file`.

    Args:
        epub_file: path of the .epub (a zip archive) to take the image from.
        audio_dir: directory the image is extracted into.
        temp_file: path of the merged audio file without cover.

    Returns:
        The name of the final .m4b (the .epub's base name with an .m4b
        extension), or None if the .epub contains no such image or ffmpeg
        exited with a non-zero status.
    """
    audiobook_file = f'{os.path.splitext(os.path.basename(epub_file))[0]}.m4b'
    with zipfile.ZipFile(epub_file, mode='r') as archive:
        for info in archive.infolist():
            if not info.filename.lower().endswith(('.jpg', '.jpeg', '.png')):
                continue
            cover_file = archive.extract(info, audio_dir)
            paths = {
                'cover_file': cover_file,
                'temp_file': temp_file,
                'audiobook_file': audiobook_file,
            }
            # The image name comes from inside the .epub, i.e. from untrusted
            # input, so build an argument list rather than a shell string.
            command = [arg.format(**paths) for arg in shlex.split(FFMPEG_COVER_CMD)]
            result = subprocess.run(command, stdout=subprocess.PIPE)
            # Only the first image is tried; report failure instead of
            # returning a file name that was never written.
            return audiobook_file if result.returncode == 0 else None
    return None

match len(sys.argv):
    case 2:
        epub_file = sys.argv[1]
        if os.path.isfile(epub_file) and epub_file.lower().endswith('.epub'):
            list_epub_chapters(epub_file)
    case 3:
        epub_file = sys.argv[1]
        chapter_range = sys.argv[2]
        if os.path.isfile(epub_file) and epub_file.lower().endswith('.epub'):

            audio_dir = os.path.splitext(os.path.basename(epub_file))[0]
            if os.path.isdir(audio_dir):
                chapter_range = stringrange_to_list(chapter_range)
                tic = start_timer()
                metadata_file, chapters_file = save_metadata(epub_file, audio_dir, chapter_range)
                temp_file = merge_audiobook(audio_dir, metadata_file, chapters_file)
                if temp_file: 
                    audiobook_file = save_cover(epub_file, audio_dir, temp_file)
                tic, min, sec = stop_timer(tic)
                print(f'Completed to {audiobook_file} in {min:,} minutes {sec} seconds')
    case _:
        cmd = sys.argv[0]
        print(f'''{cmd} epub_file - list .epub chapters (file must exist)
{cmd} epub_file chapter_range - merge audio files into single .m4b for given chapters''')

Notes:

  • Cover image extraction is very dumb - it just assumes the first image is the cover...
  • Edit the global variables as needed - these may not be the optimal ffmpeg or ffprobe commands but they work for me.
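  • Works in a temporary directory named after the .epub file (it must already exist and contain the audio files), but saves the final .m4b alongside the .epub, assuming the script is run from the .epub's directory. Assuming all goes well, feel free to delete the temporary directory once the audiobook is generated.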
  • I updated my previous Chatterbox Turbo TTS code to generate .flac but the audiobook will be .aac format.

Usage is similar to previous code:

  • running python audiobook.py book.epub will list the chapters of an .epub book.
  • running python audiobook.py book.epub 1,4-6 will combine chapter 1, 4, 5 and 6 into an audiobook.