Hello,
We’re currently operating a production IPFS cluster consisting of three nodes. Our setup is as follows:
- Master Node: This node is responsible for handling application file uploads and IPNS publishing.
- Read Nodes: These two nodes serve as gateways to the public network and are positioned behind a load balancer.
All nodes run on Docker, with an IPFS-Cluster image managing pinning across all nodes. We handle a significant volume of files and approximately 20,000 IPNS names.
Problem Description
As our application usage increased, we’ve encountered several persistent issues. Our master node crashed regularly. After each crash, IPNS records did not recover automatically. To address this, we implemented a manual republishing system that republishes IPNS names every 24 hours. We also managed to fix the crashes by configuring the connection manager and removing the master node from the public internet (it no longer serves as a gateway).
Now that our master node no longer reboots, we are facing a different problem, an IPNS resolution failure: despite our republishing system, IPNS names eventually fail to resolve on the read nodes. The only way to restore functionality is to reboot the master IPFS node (via Docker reset) and then restart the IPNS republishing process.
We have been struggling with these performance and stability issues and would appreciate any insights or solutions that could help resolve these problems.
Server Configuration
Below is our current master server configuration:
{
"API": {
"HTTPHeaders": {}
},
"Addresses": {
"API": "/ip4/0.0.0.0/tcp/5001",
"Announce": null,
"AppendAnnounce": null,
"Gateway": "/ip4/0.0.0.0/tcp/8080",
"NoAnnounce": null,
"Swarm": [
"/ip4/0.0.0.0/tcp/4001",
"/ip6/::/tcp/4001",
"/ip4/0.0.0.0/udp/4001/quic-v1",
"/ip4/0.0.0.0/udp/4001/quic-v1/webtransport",
"/ip6/::/udp/4001/quic-v1",
"/ip6/::/udp/4001/quic-v1/webtransport"
]
},
"AutoNAT": {},
"Bootstrap": [
"/dnsaddr/bootstrap.libp2p.io/p2p/QmcZf59bWwK5XFi76CZX8cbJ4BhTzzA3gU1ZjYZcYW3dwt",
"/ip4/104.131.131.82/tcp/4001/p2p/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ",
"/ip4/104.131.131.82/udp/4001/quic-v1/p2p/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ",
"/dnsaddr/bootstrap.libp2p.io/p2p/QmNnooDu7bfjPFoTZYxMNLWUQJyrVwtbZg5gBMjTezGAJN",
"/dnsaddr/bootstrap.libp2p.io/p2p/QmQCU2EcMqAqQPR2i9bChDtGNJchTbq5TbXJJ16u19uLTa",
"/dnsaddr/bootstrap.libp2p.io/p2p/QmbLHAnMoJPWSCR5Zhtx6BHJX9KiKNN6tpvbUcqanj75Nb",
"/ip4/xxx",
"/ip4/xxx"
],
"DNS": {
"Resolvers": {}
},
"Datastore": {
"BloomFilterSize": 0,
"GCPeriod": "720h",
"HashOnRead": false,
"Spec": {
"mounts": [
{
"child": {
"path": "blocks",
"shardFunc": "/repo/flatfs/shard/v1/next-to-last/2",
"sync": true,
"type": "flatfs"
},
"mountpoint": "/blocks",
"prefix": "flatfs.datastore",
"type": "measure"
},
{
"child": {
"compression": "none",
"path": "datastore",
"type": "levelds"
},
"mountpoint": "/",
"prefix": "leveldb.datastore",
"type": "measure"
}
],
"type": "mount"
},
"StorageGCWatermark": 100,
"StorageMax": "190GB"
},
"Discovery": {
"MDNS": {
"Enabled": true
}
},
"Experimental": {
"FilestoreEnabled": false,
"GraphsyncEnabled": false,
"Libp2pStreamMounting": false,
"P2pHttpProxy": false,
"StrategicProviding": false,
"UrlstoreEnabled": false
},
"Gateway": {
"APICommands": [],
"HTTPHeaders": {},
"NoDNSLink": false,
"NoFetch": false,
"PathPrefixes": [],
"PublicGateways": {
"mydomain1.com": {
"Paths": [
"/ipfs",
"/ipns"
],
"UseSubdomains": true
},
"mydomain2.io": {
"Paths": ["/ipfs", "/ipns"],
"UseSubdomains": true
}
},
"RootRedirect": "",
"Writable": false
},
"Identity": {
"PeerID": "XXX",
"PrivKey": "XXX"
},
"Internal": {},
"Ipns": {
"RecordLifetime": "48h",
"RepublishPeriod": "",
"ResolveCacheSize": 128,
"UsePubsub": true,
"MaxCacheTTL": "1m"
},
"Migration": {
"DownloadSources": [],
"Keep": ""
},
"Mounts": {
"FuseAllowOther": false,
"IPFS": "/ipfs",
"IPNS": "/ipns"
},
"Peering": {
"Peers": [
{
"ID": "1xxx",
"Addrs": ["/ip4/xxx"]
},
{
"ID": "2xxx",
"Addrs": ["/ip4/xxx"]
}
]
},
"Pinning": {
"RemoteServices": {}
},
"Plugins": {
"Plugins": null
},
"Provider": {
"Strategy": ""
},
"Pubsub": {
"DisableSigning": false,
"Router": ""
},
"Reprovider": {},
"Routing": {
"AcceleratedDHTClient": true,
"Methods": null,
"Routers": null
},
"Swarm": {
"AddrFilters": null,
"ConnMgr": {
"Enabled": true,
"LowWater": 2000,
"HighWater": 3000,
"GracePeriod": "3m"
},
"DisableBandwidthMetrics": false,
"DisableNatPortMap": true,
"RelayClient": {},
"RelayService": {},
"ResourceMgr": {},
"Transports": {
"Multiplexers": {},
"Network": {},
"Security": {}
}
}
}
Master VM setup:
- vCPU: 8
- Memory: 32 GiB
- Network Performance: Up to 12.5 Gbps