Creating an audiobook (.m4b) with FFmpeg

Following on from my previous TTS post using Chatterbox Turbo (or any other TTS like Kokoro or Qwen3-TTS), here is code to combine audio files into a single .m4b audiobook, retaining author and title metadata, chapter markers, and a cover image. Mostly this is a Python wrapper around FFmpeg.

Again, disclaimers: don’t download and run. Code may not be safe.

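The prerequisite is an up-to-date version of Python (3.10 or later, because the script uses a match statement) and ffmpeg along with ffprobe. One method is via Homebrew: brew install python ffmpeg. The script also imports the epub2text Python package.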

title and author information is read from the .epub but ffprobe is used to get the durations of each of the individual audio files, in order to create chapter breaks.
ffmpeg combines individual audio files into a .m4b, converting to aac in the process.
the Python zipfile library extracts the first image from the .epub and then,
ffmpeg is used again to add this image, assumed to be the book cover, to the metadata.

# audiobook.py - Make .m4b audiobook from individual TTS audio files (.flac by default, see INPUT_FORMAT)
# (c) C.Y., mybyways.com
#  v0.1 15 Feb 26

import os, sys, time, glob, zipfile, subprocess
from epub2text import EPUBParser

# Names of the two helper files that save_metadata() writes into the audio
# directory: the FFMETADATA1 file (title, artist, [CHAPTER] entries) and the
# list of "file '...'" lines that is fed to ffmpeg's concat demuxer.
METADATA_FILE = 'metadata.txt'
CHAPTERS_FILE = 'chapters.txt'
# ffprobe command that prints only the duration of a file; save_metadata()
# appends the quoted audio file path and parses the output as seconds.
# NOTE(review): both -v error and -loglevel quiet are given; they set the same
# option, so presumably only the last one takes effect - confirm.
FFPROBE_CMD = 'ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 -loglevel quiet'
# ffmpeg command used by merge_audiobook(): input 0 is the concat list, input 1
# is the metadata file (selected with -map_metadata 1); -y overwrites
# {temp_file} without asking.
# NOTE(review): -c:a aac and -vn are placed before the second -i, so ffmpeg
# presumably treats them as options of the metadata input rather than of the
# output - confirm the AAC encoding is not just the .m4b default.
FFMPEG_CONCAT_CMD = 'ffmpeg -y -f concat -safe 0 -i "{chapters_file}" -c:a aac -vn -i "{metadata_file}" -map_metadata 1 "{temp_file}"'
# ffmpeg command used by save_cover(): copies the streams of {temp_file} and
# {cover_file} into {audiobook_file}, flagging the image as attached picture.
# NOTE(review): unlike the concat command there is no -y here, so ffmpeg will
# presumably ask before overwriting an existing {audiobook_file} - confirm.
FFMPEG_COVER_CMD = 'ffmpeg -i "{temp_file}" -i "{cover_file}" -c copy -disposition:v attached_pic "{audiobook_file}"'
# Extension of the per-chapter input audio files, which save_metadata() looks
# up as <zero-padded chapter number><INPUT_FORMAT>, e.g. 001.flac.
INPUT_FORMAT = '.flac'

#https://stackoverflow.com/questions/6405208/how-to-convert-numeric-string-ranges-to-a-list-in-python
def stringrange_to_list(text):
    """Expand a string such as '1,4-6' into a list of ints ([1, 4, 5, 6]).

    Comma-separated items are either a single integer or a dash-separated
    range whose end is inclusive.
    """
    numbers = []
    for item in text.split(','):
        if '-' not in item:
            numbers.append(int(item))
            continue
        # Adding each part's position to its value bumps the second part by
        # one, which turns the inclusive end into range()'s exclusive stop.
        bounds = [int(part) + offset for offset, part in enumerate(item.split('-'))]
        numbers.extend(range(*bounds))
    return numbers

def start_timer():
    """Return the current time.perf_counter() reading, to hand to stop_timer()."""
    started_at = time.perf_counter()
    return started_at

def stop_timer(tic):
    """Return (elapsed seconds, whole minutes, whole leftover seconds) since `tic`.

    `tic` is a time.perf_counter() reading, as returned by start_timer().
    """
    elapsed = time.perf_counter() - tic
    whole_minutes, leftover_seconds = divmod(elapsed, 60)
    return elapsed, int(whole_minutes), int(leftover_seconds)

def list_epub_chapters(epub_file):
    """Print the book's file name, title, author(s) and a numbered chapter list.

    Each chapter line shows its 1-based number, its title and its character
    count as reported by the EPUB parser.
    """
    book = EPUBParser(epub_file)
    info = book.get_metadata()
    header = [
        f'Book: {epub_file}',
        f'Title: {info.title}',
        f'Author(s): {", ".join(info.authors)}',
    ]
    print('\n'.join(header))
    for number, chapter in enumerate(book.get_chapters(), start=1):
        print(f' {number:>3}. {chapter.title}: {chapter.char_count:,} characters')

def _escape_ffmetadata(value):
    """Backslash-escape the characters that are special in an FFMETADATA file."""
    text = str(value)
    # The backslash goes first so the escapes added for the other characters
    # are not themselves escaped again.
    for special in ('\\', '=', ';', '#', '\n'):
        text = text.replace(special, '\\' + special)
    return text


def save_metadata(epub_file, audio_dir, chapter_range):
    """Write the ffmpeg metadata file and the concat list for the given chapters.

    For every chapter number in `chapter_range` whose audio file
    (<audio_dir>/<NNN><INPUT_FORMAT>) exists, a line is added to the concat
    list and a [CHAPTER] entry is added to the metadata file. Chapter start
    and end times are in milliseconds and are accumulated from the durations
    reported by ffprobe.

    Args:
        epub_file: path of the .epub that supplies title, authors and
            chapter titles.
        audio_dir: directory that holds the audio files; both output files
            are written here.
        chapter_range: sequence of 1-based chapter numbers.

    Returns:
        Tuple (metadata_file, chapters_file) with the paths that were written.

    Raises:
        ValueError: if ffprobe does not report a usable duration for a file.
    """
    epub = EPUBParser(epub_file)
    metadata = epub.get_metadata()
    chapters = epub.get_chapters()
    metadata_file = os.path.join(audio_dir, METADATA_FILE)
    chapters_file = os.path.join(audio_dir, CHAPTERS_FILE)
    book_title = _escape_ffmetadata(metadata.title)
    book_authors = _escape_ffmetadata(", ".join(metadata.authors))
    # ffmpeg reads these files as UTF-8, so do not rely on the platform's
    # default encoding (titles often contain non-ASCII characters).
    with open(metadata_file, 'w', encoding='utf-8') as meta, \
            open(chapters_file, 'w', encoding='utf-8') as chap:
        meta.write(f''';FFMETADATA1
title={book_title}
artist={book_authors}
''')
        start = 0
        for i, c in enumerate(chapter_range):
            audio_file = os.path.join(audio_dir, f'{c:03}{INPUT_FORMAT}')
            if not os.path.isfile(audio_file):
                # Keep the original best-effort behaviour, but say so.
                print(f'Skipping chapter {c}: {audio_file} not found', file=sys.stderr)
                continue
            # Paths in the concat list are relative to the list file itself.
            chap.write(f"file '{c:03}{INPUT_FORMAT}'\n")
            result = subprocess.run(f'{FFPROBE_CMD} "{audio_file}"', shell=True, stdout=subprocess.PIPE)
            try:
                duration_ms = int(float(result.stdout) * 1000)
            except ValueError as err:
                raise ValueError(f'ffprobe did not report a duration for {audio_file}') from err
            end = start + duration_ms
            chapter_title = _escape_ffmetadata(f'{i+1}. {chapters[c-1].title}')
            meta.write(f'''[CHAPTER]
TIMEBASE=1/1000
START={start}
END={end}
title={chapter_title}
''')
            start = end
    return metadata_file, chapters_file

def merge_audiobook(audio_dir, metadata_file, chapters_file):
    """Run FFMPEG_CONCAT_CMD to build <audio_dir>/output.m4b.

    Returns the path of the merged file when ffmpeg exits with status 0,
    otherwise None.
    """
    merged_path = os.path.join(audio_dir, 'output.m4b')
    command = FFMPEG_CONCAT_CMD.format(
        chapters_file=chapters_file,
        metadata_file=metadata_file,
        temp_file=merged_path,
    )
    completed = subprocess.run(command, shell=True, stdout=subprocess.PIPE)
    if completed.returncode != 0:
        return None
    return merged_path

# Save first jpg/png image as cover and add to .m4b in same directory as .epub
def save_cover(epub_file, audio_dir, temp_file):
    """Attach the first image found in the .epub to the merged audiobook.

    The first archive member ending in .jpg, .jpeg or .png is extracted into
    `audio_dir` and passed to FFMPEG_COVER_CMD together with `temp_file`.

    Args:
        epub_file: path of the .epub (a zip archive) to take the image from.
        audio_dir: directory the image is extracted into.
        temp_file: path of the merged audiobook without cover.

    Returns:
        Path of the final .m4b, written next to the .epub with the same base
        name, or None when the .epub contains no such image or ffmpeg fails.
    """
    # Derive the output from the full .epub path (not just its basename) so
    # the audiobook lands beside the .epub even when run from elsewhere.
    audiobook_file = f'{os.path.splitext(epub_file)[0]}.m4b'
    with zipfile.ZipFile(epub_file, mode='r') as archive:
        for info in archive.infolist():
            if not info.filename.lower().endswith(('.jpg', '.jpeg', '.png')):
                continue
            cover_file = archive.extract(info, audio_dir)
            result = subprocess.run(FFMPEG_COVER_CMD.format(cover_file=cover_file, temp_file=temp_file, audiobook_file=audiobook_file), shell=True, stdout=subprocess.PIPE)
            # Only report the file when ffmpeg actually produced it.
            return audiobook_file if result.returncode == 0 else None
    return None

match len(sys.argv):
    case 2:
        epub_file = sys.argv[1]
        if os.path.isfile(epub_file) and epub_file.lower().endswith('.epub'):
            list_epub_chapters(epub_file)
    case 3:
        epub_file = sys.argv[1]
        chapter_range = sys.argv[2]
        if os.path.isfile(epub_file) and epub_file.lower().endswith('.epub'):

            audio_dir = os.path.splitext(os.path.basename(epub_file))[0]
            if os.path.isdir(audio_dir):
                chapter_range = stringrange_to_list(chapter_range)
                tic = start_timer()
                metadata_file, chapters_file = save_metadata(epub_file, audio_dir, chapter_range)
                temp_file = merge_audiobook(audio_dir, metadata_file, chapters_file)
                if temp_file: 
                    audiobook_file = save_cover(epub_file, audio_dir, temp_file)
                tic, min, sec = stop_timer(tic)
                print(f'Completed to {audiobook_file} in {min:,} minutes {sec} seconds')
    case _:
        cmd = sys.argv[0]
        print(f'''{cmd} epub_file - list .epub chapters (file must exist)
{cmd} epub_file chapter_range - merge audio files into single .m4b for given chapters''')

Notes:

Cover image extraction is very dumb - it just assumes the first image is the cover...
Edit the global variables as needed - these may not be the optimal ffmpeg or ffprobe commands but they work for me.
Works in a temporary directory named after the .epub filename (without the extension), which must already exist and contain the audio files, but saves the final .m4b alongside the .epub. Assuming all goes well, feel free to delete the temporary directory once the audiobook is generated.
I updated my previous Chatterbox Turbo TTS code to generate .flac but the audiobook will be .aac format.

Usage is similar to previous code:

running python audiobook.py book.epub will list the chapters of an .epub book.
running python audiobook.py book.epub 1,4-6 will combine chapters 1, 4, 5 and 6 into an audiobook.

❮ Older