1 /* 2 * Copyright 2006-2010, Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Andrew Galante, haiku.galante@gmail.com 7 * Axel Dörfler, axeld@pinc-software.de 8 * Hugo Santos, hugosantos@gmail.com 9 */ 10 11 12 #include "TCPEndpoint.h" 13 14 #include <netinet/in.h> 15 #include <netinet/ip.h> 16 #include <netinet/tcp.h> 17 #include <new> 18 #include <signal.h> 19 #include <stdlib.h> 20 #include <string.h> 21 #include <stdint.h> 22 23 #include <KernelExport.h> 24 #include <Select.h> 25 26 #include <net_buffer.h> 27 #include <net_datalink.h> 28 #include <net_stat.h> 29 #include <NetBufferUtilities.h> 30 #include <NetUtilities.h> 31 32 #include <lock.h> 33 #include <tracing.h> 34 #include <util/AutoLock.h> 35 #include <util/list.h> 36 37 #include "EndpointManager.h" 38 39 40 // References: 41 // - RFC 793 - Transmission Control Protocol 42 // - RFC 813 - Window and Acknowledgement Strategy in TCP 43 // - RFC 1337 - TIME_WAIT Assassination Hazards in TCP 44 // 45 // Things this implementation currently doesn't implement: 46 // - TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery, 47 // RFC 2001, RFC 2581, RFC 3042 48 // - NewReno Modification to TCP's Fast Recovery, RFC 2582 49 // - Explicit Congestion Notification (ECN), RFC 3168 50 // - SYN-Cache 51 // - SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517 52 // - Forward RTO-Recovery, RFC 4138 53 // - Time-Wait hash instead of keeping sockets alive 54 // 55 // Things incomplete in this implementation: 56 // - TCP Extensions for High Performance, RFC 1323 - RTTM, PAWS 57 58 #define PrintAddress(address) \ 59 AddressString(Domain(), address, true).Data() 60 61 //#define TRACE_TCP 62 //#define PROBE_TCP 63 64 #ifdef TRACE_TCP 65 // the space before ', ##args' is important in order for this to work with cpp 2.95 66 # define TRACE(format, args...) dprintf("%" B_PRId32 ": TCP [%" \ 67 B_PRIdBIGTIME "] %p (%12s) " format "\n", find_thread(NULL), \ 68 system_time(), this, name_for_state(fState) , ##args) 69 #else 70 # define TRACE(args...) do { } while (0) 71 #endif 72 73 #ifdef PROBE_TCP 74 # define PROBE(buffer, window) \ 75 dprintf("TCP PROBE %" B_PRIdBIGTIME " %s %s %" B_PRIu32 " snxt %" B_PRIu32 \ 76 " suna %" B_PRIu32 " cw %" B_PRIu32 " sst %" B_PRIu32 " win %" \ 77 B_PRIu32 " swin %" B_PRIu32 " smax-suna %" B_PRIu32 " savail %" \ 78 B_PRIuSIZE " sqused %" B_PRIuSIZE " rto %" B_PRIdBIGTIME "\n", \ 79 system_time(), PrintAddress(buffer->source), \ 80 PrintAddress(buffer->destination), buffer->size, fSendNext.Number(), \ 81 fSendUnacknowledged.Number(), fCongestionWindow, fSlowStartThreshold, \ 82 window, fSendWindow, (fSendMax - fSendUnacknowledged).Number(), \ 83 fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout) 84 #else 85 # define PROBE(buffer, window) do { } while (0) 86 #endif 87 88 #if TCP_TRACING 89 namespace TCPTracing { 90 91 class Receive : public AbstractTraceEntry { 92 public: 93 Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window, 94 net_buffer* buffer) 95 : 96 fEndpoint(endpoint), 97 fBuffer(buffer), 98 fBufferSize(buffer->size), 99 fSequence(segment.sequence), 100 fAcknowledge(segment.acknowledge), 101 fWindow(window), 102 fState(endpoint->State()), 103 fFlags(segment.flags) 104 { 105 Initialized(); 106 } 107 108 virtual void AddDump(TraceOutput& out) 109 { 110 out.Print("tcp:%p (%12s) receive buffer %p (%" B_PRIu32 " bytes), " 111 "flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 112 ", wnd %" B_PRIu32, fEndpoint, name_for_state(fState), fBuffer, 113 fBufferSize, fFlags, fSequence, fAcknowledge, fWindow); 114 } 115 116 protected: 117 TCPEndpoint* fEndpoint; 118 net_buffer* fBuffer; 119 uint32 fBufferSize; 120 uint32 fSequence; 121 uint32 fAcknowledge; 122 uint32 fWindow; 123 tcp_state fState; 124 uint8 fFlags; 125 }; 126 127 class Send : public AbstractTraceEntry { 128 public: 129 Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer, 130 tcp_sequence firstSequence, tcp_sequence lastSequence) 131 : 132 fEndpoint(endpoint), 133 fBuffer(buffer), 134 fBufferSize(buffer->size), 135 fSequence(segment.sequence), 136 fAcknowledge(segment.acknowledge), 137 fFirstSequence(firstSequence.Number()), 138 fLastSequence(lastSequence.Number()), 139 fState(endpoint->State()), 140 fFlags(segment.flags) 141 { 142 Initialized(); 143 } 144 145 virtual void AddDump(TraceOutput& out) 146 { 147 out.Print("tcp:%p (%12s) send buffer %p (%" B_PRIu32 " bytes), " 148 "flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 149 ", first %" B_PRIu32 ", last %" B_PRIu32, fEndpoint, 150 name_for_state(fState), fBuffer, fBufferSize, fFlags, fSequence, 151 fAcknowledge, fFirstSequence, fLastSequence); 152 } 153 154 protected: 155 TCPEndpoint* fEndpoint; 156 net_buffer* fBuffer; 157 uint32 fBufferSize; 158 uint32 fSequence; 159 uint32 fAcknowledge; 160 uint32 fFirstSequence; 161 uint32 fLastSequence; 162 tcp_state fState; 163 uint8 fFlags; 164 }; 165 166 class State : public AbstractTraceEntry { 167 public: 168 State(TCPEndpoint* endpoint) 169 : 170 fEndpoint(endpoint), 171 fState(endpoint->State()) 172 { 173 Initialized(); 174 } 175 176 virtual void AddDump(TraceOutput& out) 177 { 178 out.Print("tcp:%p (%12s) state change", fEndpoint, 179 name_for_state(fState)); 180 } 181 182 protected: 183 TCPEndpoint* fEndpoint; 184 tcp_state fState; 185 }; 186 187 class Spawn : public AbstractTraceEntry { 188 public: 189 Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint) 190 : 191 fListeningEndpoint(listeningEndpoint), 192 fSpawnedEndpoint(spawnedEndpoint) 193 { 194 Initialized(); 195 } 196 197 virtual void AddDump(TraceOutput& out) 198 { 199 out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint); 200 } 201 202 protected: 203 TCPEndpoint* fListeningEndpoint; 204 TCPEndpoint* fSpawnedEndpoint; 205 }; 206 207 class Error : public AbstractTraceEntry { 208 public: 209 Error(TCPEndpoint* endpoint, const char* error, int32 line) 210 : 211 fEndpoint(endpoint), 212 fLine(line), 213 fError(error), 214 fState(endpoint->State()) 215 { 216 Initialized(); 217 } 218 219 virtual void AddDump(TraceOutput& out) 220 { 221 out.Print("tcp:%p (%12s) error at line %" B_PRId32 ": %s", fEndpoint, 222 name_for_state(fState), fLine, fError); 223 } 224 225 protected: 226 TCPEndpoint* fEndpoint; 227 int32 fLine; 228 const char* fError; 229 tcp_state fState; 230 }; 231 232 class TimerSet : public AbstractTraceEntry { 233 public: 234 TimerSet(TCPEndpoint* endpoint, const char* which, bigtime_t timeout) 235 : 236 fEndpoint(endpoint), 237 fWhich(which), 238 fTimeout(timeout), 239 fState(endpoint->State()) 240 { 241 Initialized(); 242 } 243 244 virtual void AddDump(TraceOutput& out) 245 { 246 out.Print("tcp:%p (%12s) %s timer set to %" B_PRIdBIGTIME, fEndpoint, 247 name_for_state(fState), fWhich, fTimeout); 248 } 249 250 protected: 251 TCPEndpoint* fEndpoint; 252 const char* fWhich; 253 bigtime_t fTimeout; 254 tcp_state fState; 255 }; 256 257 class TimerTriggered : public AbstractTraceEntry { 258 public: 259 TimerTriggered(TCPEndpoint* endpoint, const char* which) 260 : 261 fEndpoint(endpoint), 262 fWhich(which), 263 fState(endpoint->State()) 264 { 265 Initialized(); 266 } 267 268 virtual void AddDump(TraceOutput& out) 269 { 270 out.Print("tcp:%p (%12s) %s timer triggered", fEndpoint, 271 name_for_state(fState), fWhich); 272 } 273 274 protected: 275 TCPEndpoint* fEndpoint; 276 const char* fWhich; 277 tcp_state fState; 278 }; 279 280 class APICall : public AbstractTraceEntry { 281 public: 282 APICall(TCPEndpoint* endpoint, const char* which) 283 : 284 fEndpoint(endpoint), 285 fWhich(which), 286 fState(endpoint->State()) 287 { 288 Initialized(); 289 } 290 291 virtual void AddDump(TraceOutput& out) 292 { 293 out.Print("tcp:%p (%12s) api call: %s", fEndpoint, 294 name_for_state(fState), fWhich); 295 } 296 297 protected: 298 TCPEndpoint* fEndpoint; 299 const char* fWhich; 300 tcp_state fState; 301 }; 302 303 } // namespace TCPTracing 304 305 # define T(x) new(std::nothrow) TCPTracing::x 306 #else 307 # define T(x) 308 #endif // TCP_TRACING 309 310 311 // constants for the fFlags field 312 enum { 313 FLAG_OPTION_WINDOW_SCALE = 0x01, 314 FLAG_OPTION_TIMESTAMP = 0x02, 315 // TODO: Should FLAG_NO_RECEIVE apply as well to received connections? 316 // That is, what is expected from accept() after a shutdown() 317 // is performed on a listen()ing socket. 318 FLAG_NO_RECEIVE = 0x04, 319 FLAG_CLOSED = 0x08, 320 FLAG_DELETE_ON_CLOSE = 0x10, 321 FLAG_LOCAL = 0x20, 322 FLAG_RECOVERY = 0x40 323 }; 324 325 326 static const int kTimestampFactor = 1000; 327 // conversion factor between usec system time and msec tcp time 328 329 330 static inline bigtime_t 331 absolute_timeout(bigtime_t timeout) 332 { 333 if (timeout == 0 || timeout == B_INFINITE_TIMEOUT) 334 return timeout; 335 336 return timeout + system_time(); 337 } 338 339 340 static inline status_t 341 posix_error(status_t error) 342 { 343 if (error == B_TIMED_OUT) 344 return B_WOULD_BLOCK; 345 346 return error; 347 } 348 349 350 static inline bool 351 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext, 352 uint32 receiveWindow) 353 { 354 return sequence >= receiveNext && sequence < (receiveNext + receiveWindow); 355 } 356 357 358 static inline bool 359 segment_in_sequence(const tcp_segment_header& segment, int size, 360 const tcp_sequence& receiveNext, uint32 receiveWindow) 361 { 362 tcp_sequence sequence(segment.sequence); 363 if (size == 0) { 364 if (receiveWindow == 0) 365 return sequence == receiveNext; 366 return in_window(sequence, receiveNext, receiveWindow); 367 } else { 368 if (receiveWindow == 0) 369 return false; 370 return in_window(sequence, receiveNext, receiveWindow) 371 || in_window(sequence + size - 1, receiveNext, receiveWindow); 372 } 373 } 374 375 376 static inline bool 377 is_writable(tcp_state state) 378 { 379 return state == ESTABLISHED || state == FINISH_RECEIVED; 380 } 381 382 383 static inline bool 384 is_establishing(tcp_state state) 385 { 386 return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED; 387 } 388 389 390 static inline uint32 tcp_now() 391 { 392 return system_time() / kTimestampFactor; 393 } 394 395 396 static inline uint32 tcp_diff_timestamp(uint32 base) 397 { 398 uint32 now = tcp_now(); 399 400 if (now > base) 401 return now - base; 402 403 return now + UINT_MAX - base; 404 } 405 406 407 static inline bool 408 state_needs_finish(int32 state) 409 { 410 return state == WAIT_FOR_FINISH_ACKNOWLEDGE 411 || state == FINISH_SENT || state == CLOSING; 412 } 413 414 415 // #pragma mark - 416 417 418 TCPEndpoint::TCPEndpoint(net_socket* socket) 419 : 420 ProtocolSocket(socket), 421 fManager(NULL), 422 fOptions(0), 423 fSendWindowShift(0), 424 fReceiveWindowShift(0), 425 fSendUnacknowledged(0), 426 fSendNext(0), 427 fSendMax(0), 428 fSendUrgentOffset(0), 429 fSendWindow(0), 430 fSendMaxWindow(0), 431 fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 432 fSendMaxSegments(0), 433 fSendQueue(socket->send.buffer_size), 434 fInitialSendSequence(0), 435 fPreviousHighestAcknowledge(0), 436 fDuplicateAcknowledgeCount(0), 437 fPreviousFlightSize(0), 438 fRecover(0), 439 fRoute(NULL), 440 fReceiveNext(0), 441 fReceiveMaxAdvertised(0), 442 fReceiveWindow(socket->receive.buffer_size), 443 fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 444 fReceiveQueue(socket->receive.buffer_size), 445 fSmoothedRoundTripTime(0), 446 fRoundTripVariation(0), 447 fSendTime(0), 448 fRoundTripStartSequence(0), 449 fRetransmitTimeout(TCP_INITIAL_RTT), 450 fReceivedTimestamp(0), 451 fCongestionWindow(0), 452 fSlowStartThreshold(0), 453 fState(CLOSED), 454 fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP) 455 { 456 // TODO: to be replaced with a real read/write locking strategy! 457 mutex_init(&fLock, "tcp lock"); 458 459 fReceiveCondition.Init(this, "tcp receive"); 460 fSendCondition.Init(this, "tcp send"); 461 462 gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this); 463 gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer, 464 this); 465 gStackModule->init_timer(&fDelayedAcknowledgeTimer, 466 TCPEndpoint::_DelayedAcknowledgeTimer, this); 467 gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, 468 this); 469 470 T(APICall(this, "constructor")); 471 } 472 473 474 TCPEndpoint::~TCPEndpoint() 475 { 476 mutex_lock(&fLock); 477 478 T(APICall(this, "destructor")); 479 480 _CancelConnectionTimers(); 481 gStackModule->cancel_timer(&fTimeWaitTimer); 482 T(TimerSet(this, "time-wait", -1)); 483 484 if (fManager != NULL) { 485 fManager->Unbind(this); 486 put_endpoint_manager(fManager); 487 } 488 489 mutex_destroy(&fLock); 490 491 // we need to wait for all timers to return 492 gStackModule->wait_for_timer(&fRetransmitTimer); 493 gStackModule->wait_for_timer(&fPersistTimer); 494 gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer); 495 gStackModule->wait_for_timer(&fTimeWaitTimer); 496 497 gDatalinkModule->put_route(Domain(), fRoute); 498 } 499 500 501 status_t 502 TCPEndpoint::InitCheck() const 503 { 504 return B_OK; 505 } 506 507 508 // #pragma mark - protocol API 509 510 511 status_t 512 TCPEndpoint::Open() 513 { 514 TRACE("Open()"); 515 T(APICall(this, "open")); 516 517 status_t status = ProtocolSocket::Open(); 518 if (status < B_OK) 519 return status; 520 521 fManager = get_endpoint_manager(Domain()); 522 if (fManager == NULL) 523 return EAFNOSUPPORT; 524 525 return B_OK; 526 } 527 528 529 status_t 530 TCPEndpoint::Close() 531 { 532 MutexLocker locker(fLock); 533 534 TRACE("Close()"); 535 T(APICall(this, "close")); 536 537 if (fState == LISTEN) 538 delete_sem(fAcceptSemaphore); 539 540 if (fState == SYNCHRONIZE_SENT || fState == LISTEN) { 541 // TODO: what about linger in case of SYNCHRONIZE_SENT? 542 fState = CLOSED; 543 T(State(this)); 544 return B_OK; 545 } 546 547 status_t status = _Disconnect(true); 548 if (status != B_OK) 549 return status; 550 551 if (socket->options & SO_LINGER) { 552 TRACE("Close(): Lingering for %i secs", socket->linger); 553 554 bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL); 555 556 while (fSendQueue.Used() > 0) { 557 status = _WaitForCondition(fSendCondition, locker, maximum); 558 if (status == B_TIMED_OUT || status == B_WOULD_BLOCK) 559 break; 560 else if (status < B_OK) 561 return status; 562 } 563 564 TRACE("Close(): after waiting, the SendQ was left with %" B_PRIuSIZE 565 " bytes.", fSendQueue.Used()); 566 } 567 return B_OK; 568 } 569 570 571 void 572 TCPEndpoint::Free() 573 { 574 MutexLocker _(fLock); 575 576 TRACE("Free()"); 577 T(APICall(this, "free")); 578 579 if (fState <= SYNCHRONIZE_SENT) 580 return; 581 582 // we are only interested in the timer, not in changing state 583 _EnterTimeWait(); 584 585 fFlags |= FLAG_CLOSED; 586 if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) { 587 // we'll be freed later when the 2MSL timer expires 588 gSocketModule->acquire_socket(socket); 589 } 590 } 591 592 593 /*! Creates and sends a synchronize packet to /a address, and then waits 594 until the connection has been established or refused. 595 */ 596 status_t 597 TCPEndpoint::Connect(const sockaddr* address) 598 { 599 if (!AddressModule()->is_same_family(address)) 600 return EAFNOSUPPORT; 601 602 MutexLocker locker(fLock); 603 604 TRACE("Connect() on address %s", PrintAddress(address)); 605 T(APICall(this, "connect")); 606 607 if (gStackModule->is_restarted_syscall()) { 608 bigtime_t timeout = gStackModule->restore_syscall_restart_timeout(); 609 status_t status = _WaitForEstablished(locker, timeout); 610 TRACE(" Connect(): Connection complete: %s (timeout was %" 611 B_PRIdBIGTIME ")", strerror(status), timeout); 612 return posix_error(status); 613 } 614 615 // Can only call connect() from CLOSED or LISTEN states 616 // otherwise endpoint is considered already connected 617 if (fState == LISTEN) { 618 // this socket is about to connect; remove pending connections in the backlog 619 gSocketModule->set_max_backlog(socket, 0); 620 } else if (fState == ESTABLISHED) { 621 return EISCONN; 622 } else if (fState != CLOSED) 623 return EALREADY; 624 625 // consider destination address INADDR_ANY as INADDR_LOOPBACK 626 sockaddr_storage _address; 627 if (AddressModule()->is_empty_address(address, false)) { 628 AddressModule()->get_loopback_address((sockaddr *)&_address); 629 // for IPv4 and IPv6 the port is at the same offset 630 ((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port; 631 address = (sockaddr *)&_address; 632 } 633 634 status_t status = _PrepareSendPath(address); 635 if (status < B_OK) 636 return status; 637 638 TRACE(" Connect(): starting 3-way handshake..."); 639 640 fState = SYNCHRONIZE_SENT; 641 T(State(this)); 642 643 // send SYN 644 status = _SendQueued(); 645 if (status != B_OK) { 646 _Close(); 647 return status; 648 } 649 650 // If we are running over Loopback, after _SendQueued() returns we 651 // may be in ESTABLISHED already. 652 if (fState == ESTABLISHED) { 653 TRACE(" Connect() completed after _SendQueued()"); 654 return B_OK; 655 } 656 657 // wait until 3-way handshake is complete (if needed) 658 bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT); 659 if (timeout == 0) { 660 // we're a non-blocking socket 661 TRACE(" Connect() delayed, return EINPROGRESS"); 662 return EINPROGRESS; 663 } 664 665 bigtime_t absoluteTimeout = absolute_timeout(timeout); 666 gStackModule->store_syscall_restart_timeout(absoluteTimeout); 667 668 status = _WaitForEstablished(locker, absoluteTimeout); 669 TRACE(" Connect(): Connection complete: %s (timeout was %" B_PRIdBIGTIME 670 ")", strerror(status), timeout); 671 return posix_error(status); 672 } 673 674 675 status_t 676 TCPEndpoint::Accept(struct net_socket** _acceptedSocket) 677 { 678 MutexLocker locker(fLock); 679 680 TRACE("Accept()"); 681 T(APICall(this, "accept")); 682 683 status_t status; 684 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 685 if (gStackModule->is_restarted_syscall()) 686 timeout = gStackModule->restore_syscall_restart_timeout(); 687 else 688 gStackModule->store_syscall_restart_timeout(timeout); 689 690 do { 691 locker.Unlock(); 692 693 status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT 694 | B_CAN_INTERRUPT, timeout); 695 if (status != B_OK) { 696 if (status == B_TIMED_OUT && socket->receive.timeout == 0) 697 return B_WOULD_BLOCK; 698 699 return status; 700 } 701 702 locker.Lock(); 703 status = gSocketModule->dequeue_connected(socket, _acceptedSocket); 704 #ifdef TRACE_TCP 705 if (status == B_OK) 706 TRACE(" Accept() returning %p", (*_acceptedSocket)->first_protocol); 707 #endif 708 } while (status != B_OK); 709 710 return status; 711 } 712 713 714 status_t 715 TCPEndpoint::Bind(const sockaddr *address) 716 { 717 if (address == NULL) 718 return B_BAD_VALUE; 719 720 MutexLocker lock(fLock); 721 722 TRACE("Bind() on address %s", PrintAddress(address)); 723 T(APICall(this, "bind")); 724 725 if (fState != CLOSED) 726 return EISCONN; 727 728 return fManager->Bind(this, address); 729 } 730 731 732 status_t 733 TCPEndpoint::Unbind(struct sockaddr *address) 734 { 735 MutexLocker _(fLock); 736 737 TRACE("Unbind()"); 738 T(APICall(this, "unbind")); 739 740 return fManager->Unbind(this); 741 } 742 743 744 status_t 745 TCPEndpoint::Listen(int count) 746 { 747 MutexLocker _(fLock); 748 749 TRACE("Listen()"); 750 T(APICall(this, "listen")); 751 752 if (fState != CLOSED && fState != LISTEN) 753 return B_BAD_VALUE; 754 755 if (fState == CLOSED) { 756 fAcceptSemaphore = create_sem(0, "tcp accept"); 757 if (fAcceptSemaphore < B_OK) 758 return ENOBUFS; 759 760 status_t status = fManager->SetPassive(this); 761 if (status != B_OK) { 762 delete_sem(fAcceptSemaphore); 763 fAcceptSemaphore = -1; 764 return status; 765 } 766 } 767 768 gSocketModule->set_max_backlog(socket, count); 769 770 fState = LISTEN; 771 T(State(this)); 772 return B_OK; 773 } 774 775 776 status_t 777 TCPEndpoint::Shutdown(int direction) 778 { 779 MutexLocker lock(fLock); 780 781 TRACE("Shutdown(%i)", direction); 782 T(APICall(this, "shutdown")); 783 784 if (direction == SHUT_RD || direction == SHUT_RDWR) 785 fFlags |= FLAG_NO_RECEIVE; 786 787 if (direction == SHUT_WR || direction == SHUT_RDWR) { 788 // TODO: That's not correct. After read/write shutting down the socket 789 // one should still be able to read previously arrived data. 790 _Disconnect(false); 791 } 792 793 return B_OK; 794 } 795 796 797 /*! Puts data contained in \a buffer into send buffer */ 798 status_t 799 TCPEndpoint::SendData(net_buffer *buffer) 800 { 801 MutexLocker lock(fLock); 802 803 TRACE("SendData(buffer %p, size %" B_PRIu32 ", flags %#" B_PRIx32 804 ") [total %" B_PRIuSIZE " bytes, has %" B_PRIuSIZE "]", buffer, 805 buffer->size, buffer->flags, fSendQueue.Size(), fSendQueue.Free()); 806 T(APICall(this, "senddata")); 807 808 uint32 flags = buffer->flags; 809 810 if (fState == CLOSED) 811 return ENOTCONN; 812 if (fState == LISTEN) 813 return EDESTADDRREQ; 814 if (!is_writable(fState) && !is_establishing(fState)) { 815 // we only send signals when called from userland 816 if (gStackModule->is_syscall() && (flags & MSG_NOSIGNAL) == 0) 817 send_signal(find_thread(NULL), SIGPIPE); 818 return EPIPE; 819 } 820 821 size_t left = buffer->size; 822 823 bigtime_t timeout = absolute_timeout(socket->send.timeout); 824 if (gStackModule->is_restarted_syscall()) 825 timeout = gStackModule->restore_syscall_restart_timeout(); 826 else 827 gStackModule->store_syscall_restart_timeout(timeout); 828 829 while (left > 0) { 830 while (fSendQueue.Free() < socket->send.low_water_mark) { 831 // wait until enough space is available 832 status_t status = _WaitForCondition(fSendCondition, lock, timeout); 833 if (status < B_OK) { 834 TRACE(" SendData() returning %s (%d)", 835 strerror(posix_error(status)), (int)posix_error(status)); 836 return posix_error(status); 837 } 838 839 if (!is_writable(fState) && !is_establishing(fState)) { 840 // we only send signals when called from userland 841 if (gStackModule->is_syscall()) 842 send_signal(find_thread(NULL), SIGPIPE); 843 return EPIPE; 844 } 845 } 846 847 size_t size = fSendQueue.Free(); 848 if (size < left) { 849 // we need to split the original buffer 850 net_buffer* clone = gBufferModule->clone(buffer, false); 851 // TODO: add offset/size parameter to net_buffer::clone() or 852 // even a move_data() function, as this is a bit inefficient 853 if (clone == NULL) 854 return ENOBUFS; 855 856 status_t status = gBufferModule->trim(clone, size); 857 if (status != B_OK) { 858 gBufferModule->free(clone); 859 return status; 860 } 861 862 gBufferModule->remove_header(buffer, size); 863 left -= size; 864 fSendQueue.Add(clone); 865 } else { 866 left -= buffer->size; 867 fSendQueue.Add(buffer); 868 } 869 } 870 871 TRACE(" SendData(): %" B_PRIuSIZE " bytes used.", fSendQueue.Used()); 872 873 bool force = false; 874 if ((flags & MSG_OOB) != 0) { 875 fSendUrgentOffset = fSendQueue.LastSequence(); 876 // RFC 961 specifies that the urgent offset points to the last 877 // byte of urgent data. However, this is commonly implemented as 878 // here, ie. it points to the first byte after the urgent data. 879 force = true; 880 } 881 if ((flags & MSG_EOF) != 0) 882 _Disconnect(false); 883 884 if (fState == ESTABLISHED || fState == FINISH_RECEIVED) 885 _SendQueued(force); 886 887 return B_OK; 888 } 889 890 891 ssize_t 892 TCPEndpoint::SendAvailable() 893 { 894 MutexLocker locker(fLock); 895 896 ssize_t available; 897 898 if (is_writable(fState)) 899 available = fSendQueue.Free(); 900 else if (is_establishing(fState)) 901 available = 0; 902 else 903 available = EPIPE; 904 905 TRACE("SendAvailable(): %" B_PRIdSSIZE, available); 906 T(APICall(this, "sendavailable")); 907 return available; 908 } 909 910 911 status_t 912 TCPEndpoint::FillStat(net_stat *stat) 913 { 914 MutexLocker _(fLock); 915 916 strlcpy(stat->state, name_for_state(fState), sizeof(stat->state)); 917 stat->receive_queue_size = fReceiveQueue.Available(); 918 stat->send_queue_size = fSendQueue.Used(); 919 920 return B_OK; 921 } 922 923 924 status_t 925 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer) 926 { 927 MutexLocker locker(fLock); 928 929 TRACE("ReadData(%" B_PRIuSIZE " bytes, flags %#" B_PRIx32 ")", numBytes, 930 flags); 931 T(APICall(this, "readdata")); 932 933 *_buffer = NULL; 934 935 if (fState == CLOSED) 936 return ENOTCONN; 937 938 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 939 if (gStackModule->is_restarted_syscall()) 940 timeout = gStackModule->restore_syscall_restart_timeout(); 941 else 942 gStackModule->store_syscall_restart_timeout(timeout); 943 944 if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) { 945 if (flags & MSG_DONTWAIT) 946 return B_WOULD_BLOCK; 947 948 status_t status = _WaitForEstablished(locker, timeout); 949 if (status < B_OK) 950 return posix_error(status); 951 } 952 953 size_t dataNeeded = socket->receive.low_water_mark; 954 955 // When MSG_WAITALL is set then the function should block 956 // until the full amount of data can be returned. 957 if (flags & MSG_WAITALL) 958 dataNeeded = numBytes; 959 960 // TODO: add support for urgent data (MSG_OOB) 961 962 while (true) { 963 if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE 964 || fState == TIME_WAIT) { 965 // ``Connection closing''. 966 return B_OK; 967 } 968 969 if (fReceiveQueue.Available() > 0) { 970 if (fReceiveQueue.Available() >= dataNeeded 971 || (fReceiveQueue.PushedData() > 0 972 && fReceiveQueue.PushedData() >= fReceiveQueue.Available())) 973 break; 974 } else if (fState == FINISH_RECEIVED) { 975 // ``If no text is awaiting delivery, the RECEIVE will 976 // get a Connection closing''. 977 return B_OK; 978 } 979 980 if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0) 981 return B_WOULD_BLOCK; 982 983 if ((fFlags & FLAG_NO_RECEIVE) != 0) 984 return B_OK; 985 986 status_t status = _WaitForCondition(fReceiveCondition, locker, timeout); 987 if (status < B_OK) { 988 // The Open Group base specification mentions that EINTR should be 989 // returned if the recv() is interrupted before _any data_ is 990 // available. So we actually check if there is data, and if so, 991 // push it to the user. 992 if ((status == B_TIMED_OUT || status == B_INTERRUPTED) 993 && fReceiveQueue.Available() > 0) 994 break; 995 996 return posix_error(status); 997 } 998 } 999 1000 TRACE(" ReadData(): %" B_PRIuSIZE " are available.", 1001 fReceiveQueue.Available()); 1002 1003 if (numBytes < fReceiveQueue.Available()) 1004 fReceiveCondition.NotifyAll(); 1005 1006 bool clone = (flags & MSG_PEEK) != 0; 1007 1008 ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer); 1009 1010 TRACE(" ReadData(): %" B_PRIuSIZE " bytes kept.", 1011 fReceiveQueue.Available()); 1012 1013 // if we are opening the window, check if we should send an ACK 1014 if (!clone) 1015 SendAcknowledge(false); 1016 1017 return receivedBytes; 1018 } 1019 1020 1021 ssize_t 1022 TCPEndpoint::ReadAvailable() 1023 { 1024 MutexLocker locker(fLock); 1025 1026 TRACE("ReadAvailable(): %" B_PRIdSSIZE, _AvailableData()); 1027 T(APICall(this, "readavailable")); 1028 1029 return _AvailableData(); 1030 } 1031 1032 1033 status_t 1034 TCPEndpoint::SetSendBufferSize(size_t length) 1035 { 1036 MutexLocker _(fLock); 1037 fSendQueue.SetMaxBytes(length); 1038 return B_OK; 1039 } 1040 1041 1042 status_t 1043 TCPEndpoint::SetReceiveBufferSize(size_t length) 1044 { 1045 MutexLocker _(fLock); 1046 fReceiveQueue.SetMaxBytes(length); 1047 return B_OK; 1048 } 1049 1050 1051 status_t 1052 TCPEndpoint::GetOption(int option, void* _value, int* _length) 1053 { 1054 if (*_length != sizeof(int)) 1055 return B_BAD_VALUE; 1056 1057 int* value = (int*)_value; 1058 1059 switch (option) { 1060 case TCP_NODELAY: 1061 if ((fOptions & TCP_NODELAY) != 0) 1062 *value = 1; 1063 else 1064 *value = 0; 1065 return B_OK; 1066 1067 case TCP_MAXSEG: 1068 *value = fReceiveMaxSegmentSize; 1069 return B_OK; 1070 1071 default: 1072 return B_BAD_VALUE; 1073 } 1074 } 1075 1076 1077 status_t 1078 TCPEndpoint::SetOption(int option, const void* _value, int length) 1079 { 1080 if (option != TCP_NODELAY) 1081 return B_BAD_VALUE; 1082 1083 if (length != sizeof(int)) 1084 return B_BAD_VALUE; 1085 1086 const int* value = (const int*)_value; 1087 1088 MutexLocker _(fLock); 1089 if (*value) 1090 fOptions |= TCP_NODELAY; 1091 else 1092 fOptions &= ~TCP_NODELAY; 1093 1094 return B_OK; 1095 } 1096 1097 1098 // #pragma mark - misc 1099 1100 1101 bool 1102 TCPEndpoint::IsBound() const 1103 { 1104 return !LocalAddress().IsEmpty(true); 1105 } 1106 1107 1108 bool 1109 TCPEndpoint::IsLocal() const 1110 { 1111 return (fFlags & FLAG_LOCAL) != 0; 1112 } 1113 1114 1115 status_t 1116 TCPEndpoint::DelayedAcknowledge() 1117 { 1118 if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) { 1119 // timer was active, send an ACK now (with the exception above, 1120 // we send every other ACK) 1121 T(TimerSet(this, "delayed ack", -1)); 1122 return SendAcknowledge(true); 1123 } 1124 1125 gStackModule->set_timer(&fDelayedAcknowledgeTimer, 1126 TCP_DELAYED_ACKNOWLEDGE_TIMEOUT); 1127 T(TimerSet(this, "delayed ack", TCP_DELAYED_ACKNOWLEDGE_TIMEOUT)); 1128 return B_OK; 1129 } 1130 1131 1132 status_t 1133 TCPEndpoint::SendAcknowledge(bool force) 1134 { 1135 return _SendQueued(force, 0); 1136 } 1137 1138 1139 void 1140 TCPEndpoint::_StartPersistTimer() 1141 { 1142 gStackModule->set_timer(&fPersistTimer, TCP_PERSIST_TIMEOUT); 1143 T(TimerSet(this, "persist", TCP_PERSIST_TIMEOUT)); 1144 } 1145 1146 1147 void 1148 TCPEndpoint::_EnterTimeWait() 1149 { 1150 TRACE("_EnterTimeWait()"); 1151 1152 if (fState == TIME_WAIT) { 1153 _CancelConnectionTimers(); 1154 1155 if (IsLocal()) { 1156 // we do not use TIME_WAIT state for local connections 1157 fFlags |= FLAG_DELETE_ON_CLOSE; 1158 return; 1159 } 1160 } 1161 1162 _UpdateTimeWait(); 1163 } 1164 1165 1166 void 1167 TCPEndpoint::_UpdateTimeWait() 1168 { 1169 gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1); 1170 T(TimerSet(this, "time-wait", TCP_MAX_SEGMENT_LIFETIME << 1)); 1171 } 1172 1173 1174 void 1175 TCPEndpoint::_CancelConnectionTimers() 1176 { 1177 gStackModule->cancel_timer(&fRetransmitTimer); 1178 T(TimerSet(this, "retransmit", -1)); 1179 gStackModule->cancel_timer(&fPersistTimer); 1180 T(TimerSet(this, "persist", -1)); 1181 gStackModule->cancel_timer(&fDelayedAcknowledgeTimer); 1182 T(TimerSet(this, "delayed ack", -1)); 1183 } 1184 1185 1186 /*! Sends the FIN flag to the peer when the connection is still open. 1187 Moves the endpoint to the next state depending on where it was. 1188 */ 1189 status_t 1190 TCPEndpoint::_Disconnect(bool closing) 1191 { 1192 tcp_state previousState = fState; 1193 1194 if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED) 1195 fState = FINISH_SENT; 1196 else if (fState == FINISH_RECEIVED) 1197 fState = WAIT_FOR_FINISH_ACKNOWLEDGE; 1198 else 1199 return B_OK; 1200 1201 T(State(this)); 1202 1203 status_t status = _SendQueued(); 1204 if (status != B_OK) { 1205 fState = previousState; 1206 T(State(this)); 1207 return status; 1208 } 1209 1210 return B_OK; 1211 } 1212 1213 1214 void 1215 TCPEndpoint::_MarkEstablished() 1216 { 1217 fState = ESTABLISHED; 1218 T(State(this)); 1219 1220 gSocketModule->set_connected(socket); 1221 if (gSocketModule->has_parent(socket)) 1222 release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE); 1223 1224 fSendCondition.NotifyAll(); 1225 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free()); 1226 } 1227 1228 1229 status_t 1230 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout) 1231 { 1232 // TODO: Checking for CLOSED seems correct, but breaks several neon tests. 1233 // When investigating this, also have a look at _Close() and _HandleReset(). 1234 while (fState < ESTABLISHED/* && fState != CLOSED*/) { 1235 if (socket->error != B_OK) 1236 return socket->error; 1237 1238 status_t status = _WaitForCondition(fSendCondition, locker, timeout); 1239 if (status < B_OK) 1240 return status; 1241 } 1242 1243 return B_OK; 1244 } 1245 1246 1247 // #pragma mark - receive 1248 1249 1250 void 1251 TCPEndpoint::_Close() 1252 { 1253 _CancelConnectionTimers(); 1254 fState = CLOSED; 1255 T(State(this)); 1256 1257 fFlags |= FLAG_DELETE_ON_CLOSE; 1258 1259 fSendCondition.NotifyAll(); 1260 _NotifyReader(); 1261 1262 if (gSocketModule->has_parent(socket)) { 1263 // We still have a parent - obviously, we haven't been accepted yet, 1264 // so no one could ever close us. 1265 _CancelConnectionTimers(); 1266 gSocketModule->set_aborted(socket); 1267 } 1268 } 1269 1270 1271 void 1272 TCPEndpoint::_HandleReset(status_t error) 1273 { 1274 socket->error = error; 1275 _Close(); 1276 1277 gSocketModule->notify(socket, B_SELECT_WRITE, error); 1278 gSocketModule->notify(socket, B_SELECT_ERROR, error); 1279 } 1280 1281 1282 void 1283 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment) 1284 { 1285 if (fDuplicateAcknowledgeCount == 0) 1286 fPreviousFlightSize = (fSendMax - fSendUnacknowledged).Number(); 1287 1288 if (++fDuplicateAcknowledgeCount < 3) { 1289 if (fSendQueue.Available(fSendMax) != 0 && fSendWindow != 0) { 1290 fSendNext = fSendMax; 1291 fCongestionWindow += fDuplicateAcknowledgeCount * fSendMaxSegmentSize; 1292 _SendQueued(); 1293 TRACE("_DuplicateAcknowledge(): packet sent under limited transmit on receipt of dup ack"); 1294 fCongestionWindow -= fDuplicateAcknowledgeCount * fSendMaxSegmentSize; 1295 } 1296 } 1297 1298 if (fDuplicateAcknowledgeCount == 3) { 1299 if ((segment.acknowledge - 1) > fRecover || (fCongestionWindow > fSendMaxSegmentSize && 1300 (fSendUnacknowledged - fPreviousHighestAcknowledge) <= 4 * fSendMaxSegmentSize)) { 1301 fFlags |= FLAG_RECOVERY; 1302 fRecover = fSendMax.Number() - 1; 1303 fSlowStartThreshold = max_c(fPreviousFlightSize / 2, 2 * fSendMaxSegmentSize); 1304 fCongestionWindow = fSlowStartThreshold + 3 * fSendMaxSegmentSize; 1305 fSendNext = segment.acknowledge; 1306 _SendQueued(); 1307 TRACE("_DuplicateAcknowledge(): packet sent under fast restransmit on the receipt of 3rd dup ack"); 1308 } 1309 } else if (fDuplicateAcknowledgeCount > 3) { 1310 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number(); 1311 if ((fDuplicateAcknowledgeCount - 3) * fSendMaxSegmentSize <= flightSize) 1312 fCongestionWindow += fSendMaxSegmentSize; 1313 if (fSendQueue.Available(fSendMax) != 0) { 1314 fSendNext = fSendMax; 1315 _SendQueued(); 1316 } 1317 } 1318 } 1319 1320 1321 void 1322 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment, 1323 size_t segmentLength) 1324 { 1325 if (fFlags & FLAG_OPTION_TIMESTAMP) { 1326 tcp_sequence sequence(segment.sequence); 1327 1328 if (fLastAcknowledgeSent >= sequence 1329 && fLastAcknowledgeSent < (sequence + segmentLength)) 1330 fReceivedTimestamp = segment.timestamp_value; 1331 } 1332 } 1333 1334 1335 ssize_t 1336 TCPEndpoint::_AvailableData() const 1337 { 1338 // TODO: Refer to the FLAG_NO_RECEIVE comment above regarding 1339 // the application of FLAG_NO_RECEIVE in listen()ing 1340 // sockets. 1341 if (fState == LISTEN) 1342 return gSocketModule->count_connected(socket); 1343 if (fState == SYNCHRONIZE_SENT) 1344 return 0; 1345 1346 ssize_t availableData = fReceiveQueue.Available(); 1347 1348 if (availableData == 0 && !_ShouldReceive()) 1349 return ENOTCONN; 1350 1351 return availableData; 1352 } 1353 1354 1355 void 1356 TCPEndpoint::_NotifyReader() 1357 { 1358 fReceiveCondition.NotifyAll(); 1359 gSocketModule->notify(socket, B_SELECT_READ, _AvailableData()); 1360 } 1361 1362 1363 bool 1364 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer) 1365 { 1366 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1367 // Remember the position of the finish received flag 1368 fFinishReceived = true; 1369 fFinishReceivedAt = segment.sequence + buffer->size; 1370 } 1371 1372 fReceiveQueue.Add(buffer, segment.sequence); 1373 fReceiveNext = fReceiveQueue.NextSequence(); 1374 1375 if (fFinishReceived) { 1376 // Set or reset the finish flag on the current segment 1377 if (fReceiveNext < fFinishReceivedAt) 1378 segment.flags &= ~TCP_FLAG_FINISH; 1379 else 1380 segment.flags |= TCP_FLAG_FINISH; 1381 } 1382 1383 TRACE(" _AddData(): adding data, receive next = %" B_PRIu32 ". Now have %" 1384 B_PRIuSIZE " bytes.", fReceiveNext.Number(), fReceiveQueue.Available()); 1385 1386 if ((segment.flags & TCP_FLAG_PUSH) != 0) 1387 fReceiveQueue.SetPushPointer(); 1388 1389 return fReceiveQueue.Available() > 0; 1390 } 1391 1392 1393 void 1394 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment) 1395 { 1396 fInitialReceiveSequence = segment.sequence; 1397 fFinishReceived = false; 1398 1399 // count the received SYN 1400 segment.sequence++; 1401 1402 fReceiveNext = segment.sequence; 1403 fReceiveQueue.SetInitialSequence(segment.sequence); 1404 1405 if ((fOptions & TCP_NOOPT) == 0) { 1406 if (segment.max_segment_size > 0) 1407 fSendMaxSegmentSize = segment.max_segment_size; 1408 1409 if (segment.options & TCP_HAS_WINDOW_SCALE) { 1410 fFlags |= FLAG_OPTION_WINDOW_SCALE; 1411 fSendWindowShift = segment.window_shift; 1412 } else { 1413 fFlags &= ~FLAG_OPTION_WINDOW_SCALE; 1414 fReceiveWindowShift = 0; 1415 } 1416 1417 if (segment.options & TCP_HAS_TIMESTAMPS) { 1418 fFlags |= FLAG_OPTION_TIMESTAMP; 1419 fReceivedTimestamp = segment.timestamp_value; 1420 } else 1421 fFlags &= ~FLAG_OPTION_TIMESTAMP; 1422 } 1423 1424 if (fSendMaxSegmentSize > 2190) 1425 fCongestionWindow = 2 * fSendMaxSegmentSize; 1426 else if (fSendMaxSegmentSize > 1095) 1427 fCongestionWindow = 3 * fSendMaxSegmentSize; 1428 else 1429 fCongestionWindow = 4 * fSendMaxSegmentSize; 1430 1431 fSendMaxSegments = fCongestionWindow / fSendMaxSegmentSize; 1432 fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift; 1433 } 1434 1435 1436 bool 1437 TCPEndpoint::_ShouldReceive() const 1438 { 1439 if ((fFlags & FLAG_NO_RECEIVE) != 0) 1440 return false; 1441 1442 return fState == ESTABLISHED || fState == FINISH_SENT 1443 || fState == FINISH_ACKNOWLEDGED; 1444 } 1445 1446 1447 int32 1448 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment, 1449 net_buffer* buffer) 1450 { 1451 MutexLocker _(fLock); 1452 1453 // TODO error checking 1454 ProtocolSocket::Open(); 1455 1456 fState = SYNCHRONIZE_RECEIVED; 1457 T(Spawn(parent, this)); 1458 1459 fManager = parent->fManager; 1460 1461 LocalAddress().SetTo(buffer->destination); 1462 PeerAddress().SetTo(buffer->source); 1463 1464 TRACE("Spawn()"); 1465 1466 // TODO: proper error handling! 1467 if (fManager->BindChild(this) != B_OK) { 1468 T(Error(this, "binding failed", __LINE__)); 1469 return DROP; 1470 } 1471 if (_PrepareSendPath(*PeerAddress()) != B_OK) { 1472 T(Error(this, "prepare send faild", __LINE__)); 1473 return DROP; 1474 } 1475 1476 fOptions = parent->fOptions; 1477 fAcceptSemaphore = parent->fAcceptSemaphore; 1478 1479 _PrepareReceivePath(segment); 1480 1481 // send SYN+ACK 1482 if (_SendQueued() != B_OK) { 1483 T(Error(this, "sending failed", __LINE__)); 1484 return DROP; 1485 } 1486 1487 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1488 // we handled this flag now, it must not be set for further processing 1489 1490 return _Receive(segment, buffer); 1491 } 1492 1493 1494 int32 1495 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer) 1496 { 1497 TRACE("ListenReceive()"); 1498 1499 // Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state, 1500 // but the error behaviour differs 1501 if (segment.flags & TCP_FLAG_RESET) 1502 return DROP; 1503 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 1504 return DROP | RESET; 1505 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1506 return DROP; 1507 1508 // TODO: drop broadcast/multicast 1509 1510 // spawn new endpoint for accept() 1511 net_socket* newSocket; 1512 if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) { 1513 T(Error(this, "spawning failed", __LINE__)); 1514 return DROP; 1515 } 1516 1517 return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this, 1518 segment, buffer); 1519 } 1520 1521 1522 int32 1523 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment, 1524 net_buffer *buffer) 1525 { 1526 TRACE("_SynchronizeSentReceive()"); 1527 1528 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1529 && (fInitialSendSequence >= segment.acknowledge 1530 || fSendMax < segment.acknowledge)) 1531 return DROP | RESET; 1532 1533 if (segment.flags & TCP_FLAG_RESET) { 1534 _HandleReset(ECONNREFUSED); 1535 return DROP; 1536 } 1537 1538 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1539 return DROP; 1540 1541 fSendUnacknowledged = segment.acknowledge; 1542 _PrepareReceivePath(segment); 1543 1544 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) { 1545 _MarkEstablished(); 1546 } else { 1547 // simultaneous open 1548 fState = SYNCHRONIZE_RECEIVED; 1549 T(State(this)); 1550 } 1551 1552 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1553 // we handled this flag now, it must not be set for further processing 1554 1555 return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE; 1556 } 1557 1558 1559 int32 1560 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer) 1561 { 1562 // PAWS processing takes precedence over regular TCP acceptability check 1563 if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0 && (segment.flags & TCP_FLAG_RESET) == 0) { 1564 if ((segment.options & TCP_HAS_TIMESTAMPS) == 0) 1565 return DROP; 1566 if ((int32)(fReceivedTimestamp - segment.timestamp_value) > 0 1567 && (fReceivedTimestamp - segment.timestamp_value) <= INT32_MAX) 1568 return DROP | IMMEDIATE_ACKNOWLEDGE; 1569 } 1570 1571 uint32 advertisedWindow = (uint32)segment.advertised_window 1572 << fSendWindowShift; 1573 size_t segmentLength = buffer->size; 1574 1575 // First, handle the most common case for uni-directional data transfer 1576 // (known as header prediction - the segment must not change the window, 1577 // and must be the expected sequence, and contain no control flags) 1578 1579 if (fState == ESTABLISHED 1580 && segment.AcknowledgeOnly() 1581 && fReceiveNext == segment.sequence 1582 && advertisedWindow > 0 && advertisedWindow == fSendWindow 1583 && fSendNext == fSendMax) { 1584 _UpdateTimestamps(segment, segmentLength); 1585 1586 if (segmentLength == 0) { 1587 // this is a pure acknowledge segment - we're on the sending end 1588 if (fSendUnacknowledged < segment.acknowledge 1589 && fSendMax >= segment.acknowledge) { 1590 _Acknowledged(segment); 1591 return DROP; 1592 } 1593 } else if (segment.acknowledge == fSendUnacknowledged 1594 && fReceiveQueue.IsContiguous() 1595 && fReceiveQueue.Free() >= segmentLength 1596 && (fFlags & FLAG_NO_RECEIVE) == 0) { 1597 if (_AddData(segment, buffer)) 1598 _NotifyReader(); 1599 1600 return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0 1601 ? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE); 1602 } 1603 } 1604 1605 // The fast path was not applicable, so we continue with the standard 1606 // processing of the incoming segment 1607 1608 ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN); 1609 1610 if (fState != CLOSED && fState != TIME_WAIT) { 1611 // Check sequence number 1612 if (!segment_in_sequence(segment, segmentLength, fReceiveNext, 1613 fReceiveWindow)) { 1614 TRACE(" Receive(): segment out of window, next: %" B_PRIu32 1615 " wnd: %" B_PRIu32, fReceiveNext.Number(), fReceiveWindow); 1616 if ((segment.flags & TCP_FLAG_RESET) != 0) { 1617 // TODO: this doesn't look right - review! 1618 return DROP; 1619 } 1620 return DROP | IMMEDIATE_ACKNOWLEDGE; 1621 } 1622 } 1623 1624 if ((segment.flags & TCP_FLAG_RESET) != 0) { 1625 // Is this a valid reset? 1626 // We generally ignore resets in time wait state (see RFC 1337) 1627 if (fLastAcknowledgeSent <= segment.sequence 1628 && tcp_sequence(segment.sequence) < (fLastAcknowledgeSent 1629 + fReceiveWindow) 1630 && fState != TIME_WAIT) { 1631 status_t error; 1632 if (fState == SYNCHRONIZE_RECEIVED) 1633 error = ECONNREFUSED; 1634 else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE) 1635 error = ENOTCONN; 1636 else 1637 error = ECONNRESET; 1638 1639 _HandleReset(error); 1640 } 1641 1642 return DROP; 1643 } 1644 1645 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1646 || (fState == SYNCHRONIZE_RECEIVED 1647 && (fInitialReceiveSequence > segment.sequence 1648 || ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1649 && (fSendUnacknowledged > segment.acknowledge 1650 || fSendMax < segment.acknowledge))))) { 1651 // reset the connection - either the initial SYN was faulty, or we 1652 // received a SYN within the data stream 1653 return DROP | RESET; 1654 } 1655 1656 // TODO: Check this! Why do we advertize a window outside of what we should 1657 // buffer? 1658 fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow); 1659 // the window must not shrink 1660 1661 // trim buffer to be within the receive window 1662 int32 drop = (int32)(fReceiveNext - segment.sequence).Number(); 1663 if (drop > 0) { 1664 if ((uint32)drop > buffer->size 1665 || ((uint32)drop == buffer->size 1666 && (segment.flags & TCP_FLAG_FINISH) == 0)) { 1667 // don't accidently remove a FIN we shouldn't remove 1668 segment.flags &= ~TCP_FLAG_FINISH; 1669 drop = buffer->size; 1670 } 1671 1672 // remove duplicate data at the start 1673 TRACE("* remove %" B_PRId32 " bytes from the start", drop); 1674 gBufferModule->remove_header(buffer, drop); 1675 segment.sequence += drop; 1676 } 1677 1678 int32 action = KEEP; 1679 1680 // immediately acknowledge out-of-order segment to trigger fast-retransmit at the sender 1681 if (drop != 0) 1682 action |= IMMEDIATE_ACKNOWLEDGE; 1683 1684 drop = (int32)(segment.sequence + buffer->size 1685 - (fReceiveNext + fReceiveWindow)).Number(); 1686 if (drop > 0) { 1687 // remove data exceeding our window 1688 if ((uint32)drop >= buffer->size) { 1689 // if we can accept data, or the segment is not what we'd expect, 1690 // drop the segment (an immediate acknowledge is always triggered) 1691 if (fReceiveWindow != 0 || segment.sequence != fReceiveNext) 1692 return DROP | IMMEDIATE_ACKNOWLEDGE; 1693 1694 action |= IMMEDIATE_ACKNOWLEDGE; 1695 } 1696 1697 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1698 // we need to remove the finish, too, as part of the data 1699 drop--; 1700 } 1701 1702 segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH); 1703 TRACE("* remove %" B_PRId32 " bytes from the end", drop); 1704 gBufferModule->remove_trailer(buffer, drop); 1705 } 1706 1707 #ifdef TRACE_TCP 1708 if (advertisedWindow > fSendWindow) { 1709 TRACE(" Receive(): Window update %" B_PRIu32 " -> %" B_PRIu32, 1710 fSendWindow, advertisedWindow); 1711 } 1712 #endif 1713 1714 if (advertisedWindow > fSendWindow) 1715 action |= IMMEDIATE_ACKNOWLEDGE; 1716 1717 fSendWindow = advertisedWindow; 1718 if (advertisedWindow > fSendMaxWindow) 1719 fSendMaxWindow = advertisedWindow; 1720 1721 // Then look at the acknowledgement for any updates 1722 1723 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) { 1724 // process acknowledged data 1725 if (fState == SYNCHRONIZE_RECEIVED) 1726 _MarkEstablished(); 1727 1728 if (fSendMax < segment.acknowledge) 1729 return DROP | IMMEDIATE_ACKNOWLEDGE; 1730 1731 if (segment.acknowledge == fSendUnacknowledged) { 1732 if (buffer->size == 0 && advertisedWindow == fSendWindow 1733 && (segment.flags & TCP_FLAG_FINISH) == 0 && fSendUnacknowledged != fSendMax) { 1734 TRACE("Receive(): duplicate ack!"); 1735 _DuplicateAcknowledge(segment); 1736 } 1737 } else if (segment.acknowledge < fSendUnacknowledged) { 1738 return DROP; 1739 } else { 1740 // this segment acknowledges in flight data 1741 1742 if (fDuplicateAcknowledgeCount >= 3) { 1743 // deflate the window. 1744 if (segment.acknowledge > fRecover) { 1745 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number(); 1746 fCongestionWindow = min_c(fSlowStartThreshold, 1747 max_c(flightSize, fSendMaxSegmentSize) + fSendMaxSegmentSize); 1748 fFlags &= ~FLAG_RECOVERY; 1749 } 1750 } 1751 1752 if (fSendMax == segment.acknowledge) 1753 TRACE("Receive(): all inflight data ack'd!"); 1754 1755 if (segment.acknowledge > fSendQueue.LastSequence() 1756 && fState > ESTABLISHED) { 1757 TRACE("Receive(): FIN has been acknowledged!"); 1758 1759 switch (fState) { 1760 case FINISH_SENT: 1761 fState = FINISH_ACKNOWLEDGED; 1762 T(State(this)); 1763 break; 1764 case CLOSING: 1765 fState = TIME_WAIT; 1766 T(State(this)); 1767 _EnterTimeWait(); 1768 return DROP; 1769 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1770 _Close(); 1771 break; 1772 1773 default: 1774 break; 1775 } 1776 } 1777 1778 if (fState != CLOSED) 1779 _Acknowledged(segment); 1780 } 1781 } 1782 1783 if (segment.flags & TCP_FLAG_URGENT) { 1784 if (fState == ESTABLISHED || fState == FINISH_SENT 1785 || fState == FINISH_ACKNOWLEDGED) { 1786 // TODO: Handle urgent data: 1787 // - RCV.UP <- max(RCV.UP, SEG.UP) 1788 // - signal the user that urgent data is available (SIGURG) 1789 } 1790 } 1791 1792 bool notify = false; 1793 1794 // The buffer may be freed if its data is added to the queue, so cache 1795 // the size as we still need it later. 1796 uint32 bufferSize = buffer->size; 1797 1798 if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0) 1799 && _ShouldReceive()) 1800 notify = _AddData(segment, buffer); 1801 else { 1802 if ((fFlags & FLAG_NO_RECEIVE) != 0) 1803 fReceiveNext += buffer->size; 1804 1805 action = (action & ~KEEP) | DROP; 1806 } 1807 1808 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1809 segmentLength++; 1810 if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) { 1811 TRACE("Receive(): peer is finishing connection!"); 1812 fReceiveNext++; 1813 notify = true; 1814 1815 // FIN implies PUSH 1816 fReceiveQueue.SetPushPointer(); 1817 1818 // we'll reply immediately to the FIN if we are not 1819 // transitioning to TIME WAIT so we immediatly ACK it. 1820 action |= IMMEDIATE_ACKNOWLEDGE; 1821 1822 // other side is closing connection; change states 1823 switch (fState) { 1824 case ESTABLISHED: 1825 case SYNCHRONIZE_RECEIVED: 1826 fState = FINISH_RECEIVED; 1827 T(State(this)); 1828 break; 1829 case FINISH_SENT: 1830 // simultaneous close 1831 fState = CLOSING; 1832 T(State(this)); 1833 break; 1834 case FINISH_ACKNOWLEDGED: 1835 fState = TIME_WAIT; 1836 T(State(this)); 1837 _EnterTimeWait(); 1838 break; 1839 case TIME_WAIT: 1840 _UpdateTimeWait(); 1841 break; 1842 1843 default: 1844 break; 1845 } 1846 } 1847 } 1848 1849 if (notify) 1850 _NotifyReader(); 1851 1852 if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0) 1853 action |= ACKNOWLEDGE; 1854 1855 _UpdateTimestamps(segment, segmentLength); 1856 1857 TRACE("Receive() Action %" B_PRId32, action); 1858 1859 return action; 1860 } 1861 1862 1863 int32 1864 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer) 1865 { 1866 MutexLocker locker(fLock); 1867 1868 TRACE("SegmentReceived(): buffer %p (%" B_PRIu32 " bytes) address %s " 1869 "to %s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 1870 ", wnd %" B_PRIu32, buffer, buffer->size, PrintAddress(buffer->source), 1871 PrintAddress(buffer->destination), segment.flags, segment.sequence, 1872 segment.acknowledge, 1873 (uint32)segment.advertised_window << fSendWindowShift); 1874 T(Receive(this, segment, 1875 (uint32)segment.advertised_window << fSendWindowShift, buffer)); 1876 int32 segmentAction = DROP; 1877 1878 switch (fState) { 1879 case LISTEN: 1880 segmentAction = _ListenReceive(segment, buffer); 1881 break; 1882 1883 case SYNCHRONIZE_SENT: 1884 segmentAction = _SynchronizeSentReceive(segment, buffer); 1885 break; 1886 1887 case SYNCHRONIZE_RECEIVED: 1888 case ESTABLISHED: 1889 case FINISH_RECEIVED: 1890 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1891 case FINISH_SENT: 1892 case FINISH_ACKNOWLEDGED: 1893 case CLOSING: 1894 case TIME_WAIT: 1895 case CLOSED: 1896 segmentAction = _Receive(segment, buffer); 1897 break; 1898 } 1899 1900 // process acknowledge action as asked for by the *Receive() method 1901 if (segmentAction & IMMEDIATE_ACKNOWLEDGE) 1902 SendAcknowledge(true); 1903 else if (segmentAction & ACKNOWLEDGE) 1904 DelayedAcknowledge(); 1905 1906 if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) 1907 == (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) { 1908 locker.Unlock(); 1909 gSocketModule->release_socket(socket); 1910 } 1911 1912 return segmentAction; 1913 } 1914 1915 1916 // #pragma mark - send 1917 1918 1919 inline uint8 1920 TCPEndpoint::_CurrentFlags() 1921 { 1922 // we don't set FLAG_FINISH here, instead we do it 1923 // conditionally below depending if we are sending 1924 // the last bytes of the send queue. 1925 1926 switch (fState) { 1927 case CLOSED: 1928 return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE; 1929 1930 case SYNCHRONIZE_SENT: 1931 return TCP_FLAG_SYNCHRONIZE; 1932 case SYNCHRONIZE_RECEIVED: 1933 return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE; 1934 1935 case ESTABLISHED: 1936 case FINISH_RECEIVED: 1937 case FINISH_ACKNOWLEDGED: 1938 case TIME_WAIT: 1939 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1940 case FINISH_SENT: 1941 case CLOSING: 1942 return TCP_FLAG_ACKNOWLEDGE; 1943 1944 default: 1945 return 0; 1946 } 1947 } 1948 1949 1950 inline bool 1951 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length, 1952 uint32 segmentMaxSize, uint32 flightSize) 1953 { 1954 if (fState == ESTABLISHED && fSendMaxSegments == 0) 1955 return false; 1956 1957 if (length > 0) { 1958 // Avoid the silly window syndrome - we only send a segment in case: 1959 // - we have a full segment to send, or 1960 // - we're at the end of our buffer queue, or 1961 // - the buffer is at least larger than half of the maximum send window, 1962 // or 1963 // - we're retransmitting data 1964 if (length == segmentMaxSize 1965 || (fOptions & TCP_NODELAY) != 0 1966 || tcp_sequence(fSendNext + length) == fSendQueue.LastSequence() 1967 || (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2)) 1968 return true; 1969 } 1970 1971 // check if we need to send a window update to the peer 1972 if (segment.advertised_window > 0) { 1973 // correct the window to take into account what already has been advertised 1974 uint32 window = (segment.advertised_window << fReceiveWindowShift) 1975 - (fReceiveMaxAdvertised - fReceiveNext).Number(); 1976 1977 // if we can advertise a window larger than twice the maximum segment 1978 // size, or half the maximum buffer size we send a window update 1979 if (window >= (fReceiveMaxSegmentSize << 1) 1980 || window >= (socket->receive.buffer_size >> 1)) 1981 return true; 1982 } 1983 1984 if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH 1985 | TCP_FLAG_RESET)) != 0) 1986 return true; 1987 1988 // We do have urgent data pending 1989 if (fSendUrgentOffset > fSendNext) 1990 return true; 1991 1992 // there is no reason to send a segment just now 1993 return false; 1994 } 1995 1996 1997 status_t 1998 TCPEndpoint::_SendQueued(bool force) 1999 { 2000 return _SendQueued(force, fSendWindow); 2001 } 2002 2003 2004 /*! Sends one or more TCP segments with the data waiting in the queue, or some 2005 specific flags that need to be sent. 2006 */ 2007 status_t 2008 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow) 2009 { 2010 if (fRoute == NULL) 2011 return B_ERROR; 2012 2013 // in passive state? 2014 if (fState == LISTEN) 2015 return B_ERROR; 2016 2017 tcp_segment_header segment(_CurrentFlags()); 2018 2019 if ((fOptions & TCP_NOOPT) == 0) { 2020 if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) { 2021 segment.options |= TCP_HAS_TIMESTAMPS; 2022 segment.timestamp_reply = fReceivedTimestamp; 2023 segment.timestamp_value = tcp_now(); 2024 } 2025 2026 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 2027 && fSendNext == fInitialSendSequence) { 2028 // add connection establishment options 2029 segment.max_segment_size = fReceiveMaxSegmentSize; 2030 if (fFlags & FLAG_OPTION_WINDOW_SCALE) { 2031 segment.options |= TCP_HAS_WINDOW_SCALE; 2032 segment.window_shift = fReceiveWindowShift; 2033 } 2034 } 2035 } 2036 2037 size_t availableBytes = fReceiveQueue.Free(); 2038 // window size must remain same for duplicate acknowledgements 2039 if (!fReceiveQueue.IsContiguous()) 2040 availableBytes = (fReceiveMaxAdvertised - fReceiveNext).Number(); 2041 2042 if (fFlags & FLAG_OPTION_WINDOW_SCALE) 2043 segment.advertised_window = availableBytes >> fReceiveWindowShift; 2044 else 2045 segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes); 2046 2047 segment.acknowledge = fReceiveNext.Number(); 2048 2049 // Process urgent data 2050 if (fSendUrgentOffset > fSendNext) { 2051 segment.flags |= TCP_FLAG_URGENT; 2052 segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number(); 2053 } else { 2054 fSendUrgentOffset = fSendUnacknowledged.Number(); 2055 // Keep urgent offset updated, so that it doesn't reach into our 2056 // send window on overlap 2057 segment.urgent_offset = 0; 2058 } 2059 2060 if (fCongestionWindow > 0 && fCongestionWindow < sendWindow) 2061 sendWindow = fCongestionWindow; 2062 2063 // fSendUnacknowledged 2064 // | fSendNext fSendMax 2065 // | | | 2066 // v v v 2067 // ----------------------------------- 2068 // | effective window | 2069 // ----------------------------------- 2070 2071 // Flight size represents the window of data which is currently in the 2072 // ether. We should never send data such as the flight size becomes larger 2073 // than the effective window. Note however that the effective window may be 2074 // reduced (by congestion for instance), so at some point in time flight 2075 // size may be larger than the currently calculated window. 2076 2077 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number(); 2078 uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number(); 2079 2080 if (consumedWindow > sendWindow) { 2081 sendWindow = 0; 2082 // TODO: enter persist state? try to get a window update. 2083 } else 2084 sendWindow -= consumedWindow; 2085 2086 uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow); 2087 bool shouldStartRetransmitTimer = fSendNext == fSendUnacknowledged; 2088 bool retransmit = fSendNext < fSendMax; 2089 2090 if (fDuplicateAcknowledgeCount != 0) { 2091 // send at most 1 SMSS of data when under limited transmit, fast transmit/recovery 2092 length = min_c(length, fSendMaxSegmentSize); 2093 } 2094 2095 do { 2096 uint32 segmentMaxSize = fSendMaxSegmentSize 2097 - tcp_options_length(segment); 2098 uint32 segmentLength = min_c(length, segmentMaxSize); 2099 2100 if (fSendNext + segmentLength == fSendQueue.LastSequence() && !force) { 2101 if (state_needs_finish(fState)) 2102 segment.flags |= TCP_FLAG_FINISH; 2103 if (length > 0) 2104 segment.flags |= TCP_FLAG_PUSH; 2105 } 2106 2107 // Determine if we should really send this segment 2108 if (!force && !retransmit && !_ShouldSendSegment(segment, segmentLength, 2109 segmentMaxSize, flightSize)) { 2110 if (fSendQueue.Available() 2111 && !gStackModule->is_timer_active(&fPersistTimer) 2112 && !gStackModule->is_timer_active(&fRetransmitTimer)) 2113 _StartPersistTimer(); 2114 break; 2115 } 2116 2117 net_buffer *buffer = gBufferModule->create(256); 2118 if (buffer == NULL) 2119 return B_NO_MEMORY; 2120 2121 status_t status = B_OK; 2122 if (segmentLength > 0) 2123 status = fSendQueue.Get(buffer, fSendNext, segmentLength); 2124 if (status < B_OK) { 2125 gBufferModule->free(buffer); 2126 return status; 2127 } 2128 2129 LocalAddress().CopyTo(buffer->source); 2130 PeerAddress().CopyTo(buffer->destination); 2131 2132 uint32 size = buffer->size; 2133 segment.sequence = fSendNext.Number(); 2134 2135 TRACE("SendQueued(): buffer %p (%" B_PRIu32 " bytes) address %s to " 2136 "%s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 2137 ", rwnd %" B_PRIu16 ", cwnd %" B_PRIu32 ", ssthresh %" B_PRIu32 2138 ", len %" B_PRIu32 ", first %" B_PRIu32 ", last %" B_PRIu32, 2139 buffer, buffer->size, PrintAddress(buffer->source), 2140 PrintAddress(buffer->destination), segment.flags, segment.sequence, 2141 segment.acknowledge, segment.advertised_window, 2142 fCongestionWindow, fSlowStartThreshold, segmentLength, 2143 fSendQueue.FirstSequence().Number(), 2144 fSendQueue.LastSequence().Number()); 2145 T(Send(this, segment, buffer, fSendQueue.FirstSequence(), 2146 fSendQueue.LastSequence())); 2147 2148 PROBE(buffer, sendWindow); 2149 sendWindow -= buffer->size; 2150 2151 status = add_tcp_header(AddressModule(), segment, buffer); 2152 if (status != B_OK) { 2153 gBufferModule->free(buffer); 2154 return status; 2155 } 2156 2157 // Update send status - we need to do this before we send the data 2158 // for local connections as the answer is directly handled 2159 2160 if (segment.flags & TCP_FLAG_SYNCHRONIZE) { 2161 segment.options &= ~TCP_HAS_WINDOW_SCALE; 2162 segment.max_segment_size = 0; 2163 size++; 2164 } 2165 2166 if (segment.flags & TCP_FLAG_FINISH) 2167 size++; 2168 2169 uint32 sendMax = fSendMax.Number(); 2170 fSendNext += size; 2171 if (fSendMax < fSendNext) 2172 fSendMax = fSendNext; 2173 2174 fReceiveMaxAdvertised = fReceiveNext 2175 + ((uint32)segment.advertised_window << fReceiveWindowShift); 2176 2177 if (segmentLength != 0 && fState == ESTABLISHED) 2178 --fSendMaxSegments; 2179 2180 status = next->module->send_routed_data(next, fRoute, buffer); 2181 if (status < B_OK) { 2182 gBufferModule->free(buffer); 2183 2184 fSendNext = segment.sequence; 2185 fSendMax = sendMax; 2186 // restore send status 2187 return status; 2188 } 2189 2190 if (fSendTime == 0 && !retransmit 2191 && (segmentLength != 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) !=0)) { 2192 fSendTime = tcp_now(); 2193 fRoundTripStartSequence = segment.sequence; 2194 } 2195 2196 if (shouldStartRetransmitTimer && size > 0) { 2197 TRACE("starting initial retransmit timer of: %" B_PRIdBIGTIME, 2198 fRetransmitTimeout); 2199 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 2200 T(TimerSet(this, "retransmit", fRetransmitTimeout)); 2201 shouldStartRetransmitTimer = false; 2202 } 2203 2204 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 2205 fLastAcknowledgeSent = segment.acknowledge; 2206 2207 length -= segmentLength; 2208 segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET 2209 | TCP_FLAG_FINISH); 2210 2211 if (retransmit) 2212 break; 2213 2214 } while (length > 0); 2215 2216 return B_OK; 2217 } 2218 2219 2220 int 2221 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const 2222 { 2223 return next->module->get_mtu(next, address) - sizeof(tcp_header); 2224 } 2225 2226 2227 status_t 2228 TCPEndpoint::_PrepareSendPath(const sockaddr* peer) 2229 { 2230 if (fRoute == NULL) { 2231 fRoute = gDatalinkModule->get_route(Domain(), peer); 2232 if (fRoute == NULL) 2233 return ENETUNREACH; 2234 2235 if ((fRoute->flags & RTF_LOCAL) != 0) 2236 fFlags |= FLAG_LOCAL; 2237 } 2238 2239 // make sure connection does not already exist 2240 status_t status = fManager->SetConnection(this, *LocalAddress(), peer, 2241 fRoute->interface_address->local); 2242 if (status < B_OK) 2243 return status; 2244 2245 fInitialSendSequence = system_time() >> 4; 2246 fSendNext = fInitialSendSequence; 2247 fSendUnacknowledged = fInitialSendSequence; 2248 fSendMax = fInitialSendSequence; 2249 fSendUrgentOffset = fInitialSendSequence; 2250 fRecover = fInitialSendSequence.Number(); 2251 2252 // we are counting the SYN here 2253 fSendQueue.SetInitialSequence(fSendNext + 1); 2254 2255 fReceiveMaxSegmentSize = _MaxSegmentSize(peer); 2256 2257 // Compute the window shift we advertise to our peer - if it doesn't support 2258 // this option, this will be reset to 0 (when its SYN is received) 2259 fReceiveWindowShift = 0; 2260 while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT 2261 && (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) { 2262 fReceiveWindowShift++; 2263 } 2264 2265 return B_OK; 2266 } 2267 2268 2269 void 2270 TCPEndpoint::_Acknowledged(tcp_segment_header& segment) 2271 { 2272 TRACE("_Acknowledged(): ack %" B_PRIu32 "; uack %" B_PRIu32 "; next %" 2273 B_PRIu32 "; max %" B_PRIu32, segment.acknowledge, 2274 fSendUnacknowledged.Number(), fSendNext.Number(), fSendMax.Number()); 2275 2276 ASSERT(fSendUnacknowledged <= segment.acknowledge); 2277 2278 if (fSendUnacknowledged < segment.acknowledge) { 2279 fSendQueue.RemoveUntil(segment.acknowledge); 2280 2281 uint32 bytesAcknowledged = segment.acknowledge - fSendUnacknowledged.Number(); 2282 fPreviousHighestAcknowledge = fSendUnacknowledged; 2283 fSendUnacknowledged = segment.acknowledge; 2284 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number(); 2285 int32 expectedSamples = flightSize / (fSendMaxSegmentSize << 1); 2286 2287 if (fPreviousHighestAcknowledge > fSendUnacknowledged) { 2288 // need to update the recover variable upon a sequence wraparound 2289 fRecover = segment.acknowledge - 1; 2290 } 2291 2292 // the acknowledgment of the SYN/ACK MUST NOT increase the size of the congestion window 2293 if (fSendUnacknowledged != fInitialSendSequence) { 2294 if (fCongestionWindow < fSlowStartThreshold) 2295 fCongestionWindow += min_c(bytesAcknowledged, fSendMaxSegmentSize); 2296 else { 2297 uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize; 2298 2299 if (increment < fCongestionWindow) 2300 increment = 1; 2301 else 2302 increment /= fCongestionWindow; 2303 2304 fCongestionWindow += increment; 2305 } 2306 2307 fSendMaxSegments = UINT32_MAX; 2308 } 2309 2310 if ((fFlags & FLAG_RECOVERY) != 0) { 2311 fSendNext = fSendUnacknowledged; 2312 _SendQueued(); 2313 fCongestionWindow -= bytesAcknowledged; 2314 2315 if (bytesAcknowledged > fSendMaxSegmentSize) 2316 fCongestionWindow += fSendMaxSegmentSize; 2317 2318 fSendNext = fSendMax; 2319 } else 2320 fDuplicateAcknowledgeCount = 0; 2321 2322 if (fSendNext < fSendUnacknowledged) 2323 fSendNext = fSendUnacknowledged; 2324 2325 if (fFlags & FLAG_OPTION_TIMESTAMP) { 2326 _UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply), 2327 expectedSamples > 0 ? expectedSamples : 1); 2328 } else if (fSendTime != 0 && fRoundTripStartSequence < segment.acknowledge) { 2329 _UpdateRoundTripTime(tcp_diff_timestamp(fSendTime), 1); 2330 fSendTime = 0; 2331 } 2332 2333 if (fSendUnacknowledged == fSendMax) { 2334 TRACE("all acknowledged, cancelling retransmission timer."); 2335 gStackModule->cancel_timer(&fRetransmitTimer); 2336 T(TimerSet(this, "retransmit", -1)); 2337 } else { 2338 TRACE("data acknowledged, resetting retransmission timer to: %" 2339 B_PRIdBIGTIME, fRetransmitTimeout); 2340 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 2341 T(TimerSet(this, "retransmit", fRetransmitTimeout)); 2342 } 2343 2344 if (is_writable(fState)) { 2345 // notify threads waiting on the socket to become writable again 2346 fSendCondition.NotifyAll(); 2347 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free()); 2348 } 2349 } 2350 2351 // if there is data left to be sent, send it now 2352 if (fSendQueue.Used() > 0) 2353 _SendQueued(); 2354 } 2355 2356 2357 void 2358 TCPEndpoint::_Retransmit() 2359 { 2360 TRACE("Retransmit()"); 2361 2362 if (fState < ESTABLISHED) { 2363 fRetransmitTimeout = TCP_SYN_RETRANSMIT_TIMEOUT; 2364 fCongestionWindow = fSendMaxSegmentSize; 2365 } else { 2366 _ResetSlowStart(); 2367 fDuplicateAcknowledgeCount = 0; 2368 // Do exponential back off of the retransmit timeout 2369 fRetransmitTimeout *= 2; 2370 if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT) 2371 fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT; 2372 } 2373 2374 fSendNext = fSendUnacknowledged; 2375 _SendQueued(); 2376 2377 fRecover = fSendNext.Number() - 1; 2378 if ((fFlags & FLAG_RECOVERY) != 0) 2379 fFlags &= ~FLAG_RECOVERY; 2380 } 2381 2382 2383 void 2384 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime, int32 expectedSamples) 2385 { 2386 if (fSmoothedRoundTripTime == 0) { 2387 fSmoothedRoundTripTime = roundTripTime; 2388 fRoundTripVariation = roundTripTime / 2; 2389 fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, fRoundTripVariation * 4)) 2390 * kTimestampFactor; 2391 } else { 2392 int32 delta = fSmoothedRoundTripTime - roundTripTime; 2393 if (delta < 0) 2394 delta = -delta; 2395 fRoundTripVariation += (delta - fRoundTripVariation) / (expectedSamples * 4); 2396 fSmoothedRoundTripTime += (roundTripTime - fSmoothedRoundTripTime) / (expectedSamples * 8); 2397 fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, fRoundTripVariation * 4)) 2398 * kTimestampFactor; 2399 } 2400 2401 if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT) 2402 fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT; 2403 2404 if (fRetransmitTimeout < TCP_MIN_RETRANSMIT_TIMEOUT) 2405 fRetransmitTimeout = TCP_MIN_RETRANSMIT_TIMEOUT; 2406 2407 TRACE(" RTO is now %" B_PRIdBIGTIME " (after rtt %" B_PRId32 "ms)", 2408 fRetransmitTimeout, roundTripTime); 2409 } 2410 2411 2412 void 2413 TCPEndpoint::_ResetSlowStart() 2414 { 2415 fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2, 2416 2 * fSendMaxSegmentSize); 2417 fCongestionWindow = fSendMaxSegmentSize; 2418 } 2419 2420 2421 // #pragma mark - timer 2422 2423 2424 /*static*/ void 2425 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint) 2426 { 2427 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2428 T(TimerTriggered(endpoint, "retransmit")); 2429 2430 MutexLocker locker(endpoint->fLock); 2431 if (!locker.IsLocked() || gStackModule->is_timer_active(timer)) 2432 return; 2433 2434 endpoint->_Retransmit(); 2435 } 2436 2437 2438 /*static*/ void 2439 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint) 2440 { 2441 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2442 T(TimerTriggered(endpoint, "persist")); 2443 2444 MutexLocker locker(endpoint->fLock); 2445 if (!locker.IsLocked()) 2446 return; 2447 2448 // the timer might not have been canceled early enough 2449 if (endpoint->State() == CLOSED) 2450 return; 2451 2452 endpoint->_SendQueued(true); 2453 } 2454 2455 2456 /*static*/ void 2457 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint) 2458 { 2459 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2460 T(TimerTriggered(endpoint, "delayed ack")); 2461 2462 MutexLocker locker(endpoint->fLock); 2463 if (!locker.IsLocked()) 2464 return; 2465 2466 // the timer might not have been canceled early enough 2467 if (endpoint->State() == CLOSED) 2468 return; 2469 2470 endpoint->SendAcknowledge(true); 2471 } 2472 2473 2474 /*static*/ void 2475 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint) 2476 { 2477 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2478 T(TimerTriggered(endpoint, "time-wait")); 2479 2480 MutexLocker locker(endpoint->fLock); 2481 if (!locker.IsLocked()) 2482 return; 2483 2484 if ((endpoint->fFlags & FLAG_CLOSED) == 0) { 2485 endpoint->fFlags |= FLAG_DELETE_ON_CLOSE; 2486 return; 2487 } 2488 2489 locker.Unlock(); 2490 2491 gSocketModule->release_socket(endpoint->socket); 2492 } 2493 2494 2495 /*static*/ status_t 2496 TCPEndpoint::_WaitForCondition(ConditionVariable& condition, 2497 MutexLocker& locker, bigtime_t timeout) 2498 { 2499 ConditionVariableEntry entry; 2500 condition.Add(&entry); 2501 2502 locker.Unlock(); 2503 status_t result = entry.Wait(B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, timeout); 2504 locker.Lock(); 2505 2506 return result; 2507 } 2508 2509 2510 // #pragma mark - 2511 2512 2513 void 2514 TCPEndpoint::Dump() const 2515 { 2516 kprintf("TCP endpoint %p\n", this); 2517 kprintf(" state: %s\n", name_for_state(fState)); 2518 kprintf(" flags: 0x%" B_PRIx32 "\n", fFlags); 2519 #if KDEBUG 2520 kprintf(" lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder); 2521 #endif 2522 kprintf(" accept sem: %" B_PRId32 "\n", fAcceptSemaphore); 2523 kprintf(" options: 0x%" B_PRIx32 "\n", (uint32)fOptions); 2524 kprintf(" send\n"); 2525 kprintf(" window shift: %" B_PRIu8 "\n", fSendWindowShift); 2526 kprintf(" unacknowledged: %" B_PRIu32 "\n", 2527 fSendUnacknowledged.Number()); 2528 kprintf(" next: %" B_PRIu32 "\n", fSendNext.Number()); 2529 kprintf(" max: %" B_PRIu32 "\n", fSendMax.Number()); 2530 kprintf(" urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number()); 2531 kprintf(" window: %" B_PRIu32 "\n", fSendWindow); 2532 kprintf(" max window: %" B_PRIu32 "\n", fSendMaxWindow); 2533 kprintf(" max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize); 2534 kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", fSendQueue.Used(), 2535 fSendQueue.Size()); 2536 #if DEBUG_BUFFER_QUEUE 2537 fSendQueue.Dump(); 2538 #endif 2539 kprintf(" last acknowledge sent: %" B_PRIu32 "\n", 2540 fLastAcknowledgeSent.Number()); 2541 kprintf(" initial sequence: %" B_PRIu32 "\n", 2542 fInitialSendSequence.Number()); 2543 kprintf(" receive\n"); 2544 kprintf(" window shift: %" B_PRIu8 "\n", fReceiveWindowShift); 2545 kprintf(" next: %" B_PRIu32 "\n", fReceiveNext.Number()); 2546 kprintf(" max advertised: %" B_PRIu32 "\n", 2547 fReceiveMaxAdvertised.Number()); 2548 kprintf(" window: %" B_PRIu32 "\n", fReceiveWindow); 2549 kprintf(" max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize); 2550 kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", 2551 fReceiveQueue.Available(), fReceiveQueue.Size()); 2552 #if DEBUG_BUFFER_QUEUE 2553 fReceiveQueue.Dump(); 2554 #endif 2555 kprintf(" initial sequence: %" B_PRIu32 "\n", 2556 fInitialReceiveSequence.Number()); 2557 kprintf(" duplicate acknowledge count: %" B_PRIu32 "\n", 2558 fDuplicateAcknowledgeCount); 2559 kprintf(" smoothed round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n", 2560 fSmoothedRoundTripTime, fRoundTripVariation); 2561 kprintf(" retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout); 2562 kprintf(" congestion window: %" B_PRIu32 "\n", fCongestionWindow); 2563 kprintf(" slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold); 2564 } 2565 2566